//! vCPU exit classification and per-arch I/O dispatch.
//!
//! Shared between BSP and AP run loops. Each exit gets classified into
//! [`ExitAction`] (Continue / Shutdown / Fatal); arch-specific I/O is
//! dispatched inline so the surrounding loop only sees the action.
//!
//! - x86_64: serial via port I/O ([`dispatch_io_out`] / [`dispatch_io_in`]),
//!   virtio-console via MMIO inside [`classify_exit`], i8042 reset for reboot.
//! - aarch64: serial + virtio-console both via MMIO ([`dispatch_mmio_write`]
//!   / [`dispatch_mmio_read`]).

use crate::vmm::PiMutex;
use crate::vmm::vcpu::{SCX_EXIT_ERROR_THRESHOLD, WatchpointArm, self_arm_watchpoint};
use crate::vmm::{console, kvm, virtio_blk, virtio_console, virtio_net};
use kvm_ioctls::VcpuExit;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use vmm_sys_util::eventfd::EventFd;

/// Snapshot of a vCPU's architectural state, captured by the vCPU
/// thread itself at freeze time (just before it parks). Surfaced in
/// the failure-dump report so an operator can correlate the observed
/// guest-memory state with where each vCPU was executing.
///
/// Field naming is arch-neutral: each value is set from the matching
/// per-arch register so the layout is identical across x86_64 and
/// aarch64 in JSON / Display output.
///
/// Capture must run ON the vCPU thread (not cross-thread) because
/// `KVM_GET_REGS` / `KVM_GET_SREGS` are vCPU-fd-bound ioctls; their
/// thread-affinity is a KVM API contract documented in the kernel
/// vCPU lifecycle (Documentation/virt/kvm/api.rst). Calling them
/// from a different thread reads stale state at best and races KVM's
/// internal vCPU state machine at worst.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[non_exhaustive]
pub struct VcpuRegSnapshot {
    /// Instruction pointer at freeze time (`rip` on x86_64,
    /// `pc` on aarch64). Identifies the kernel/userspace function
    /// the vCPU was executing when the freeze coordinator's kick
    /// arrived.
    pub instruction_pointer: u64,
    /// Kernel-side stack pointer at freeze time (`rsp` on x86_64,
    /// `sp_el1` on aarch64 — explicitly NOT `sp_el0`, which is the
    /// userspace stack). Captures the EL1/CPL0 stack frame an
    /// operator can unwind against the BPF map dump for sched_ext
    /// failures, which fire in kernel context.
    pub stack_pointer: u64,
    /// Page-table root at freeze time. Captures arch-specific
    /// kernel-side state suitable for correlating the BPF map
    /// dump with the active address space:
    ///
    ///   - On x86_64: `cr3` — per-process pgd. Distinct from
    ///     [`crate::monitor::guest::GuestKernel::cr3_pa`], which
    ///     captures the boot-time `init_top_pgt` at coordinator
    ///     start. This snapshot field reflects whatever pgd the
    ///     vCPU was running on at freeze time (typically the
    ///     current task's mm); the boot-time value is what the
    ///     freeze coordinator uses for its own page-walks.
    ///
    ///   - On aarch64: `ttbr1_el1` — the kernel pgd. Stays
    ///     stable across context switches (TTBR0_EL1 swaps
    ///     per-task; see [`Self::user_page_table_root`] for the
    ///     userspace half).
    ///
    /// Raw register value with arch-specific flag bits intact
    /// (PCID/PCD/PWT on x86_64 CR3, ASID on aarch64 TTBR);
    /// consumers must mask before walking as a physical address.
    pub page_table_root: u64,
    /// Userspace page-table root at freeze time. arch-specific:
    ///
    ///   - On x86_64: always `None`. CR3 already covers both the
    ///     kernel and userspace halves of the address space —
    ///     [`Self::page_table_root`] alone identifies the active
    ///     mm.
    ///
    ///   - On aarch64: `Some(ttbr0_el1)` when capture succeeds,
    ///     `None` when KVM_GET_ONE_REG fails (mid-shutdown vCPU,
    ///     sysreg gated by the host kernel). TTBR0_EL1 holds the
    ///     userspace pgd that switches per-task, so it
    ///     identifies which task's userspace was active at
    ///     freeze time — useful for diagnosing user-context
    ///     traps. For sched_ext failures (kernel context),
    ///     TTBR1_EL1 in `page_table_root` is the primary signal.
    ///
    /// Raw register value with arch-specific flag bits intact
    /// (PCID/PCD/PWT on x86_64 CR3, ASID on aarch64 TTBR);
    /// consumers must mask before walking as a physical address.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_page_table_root: Option<u64>,
    /// aarch64 TCR_EL1 register at freeze time. Drives the
    /// granule-agnostic page-table walker
    /// ([`crate::monitor::reader::GuestMem::translate_kva`]):
    /// TG1 bits [31:30] select the high-half granule (4 KB / 16 KB
    /// / 64 KB) and T1SZ bits [21:16] determine the high-half VA
    /// width (`64 - T1SZ`). Stable after kernel MMU bring-up.
    /// `None` on x86_64 (the register does not exist) and on
    /// aarch64 if the KVM_GET_ONE_REG read fails mid-shutdown.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tcr_el1: Option<u64>,
}

/// Capture the vCPU's RIP/RSP/CR3 (or PC/SP/TTBR1 on aarch64) on
/// the calling thread. Invoked from `handle_freeze` after the drain
/// dance and before the `parked = true` Release store, so the
/// values reach the freeze coordinator via the same happens-before
/// edge the coordinator relies on for guest-memory reads.
///
/// `None` on capture failure — the get_regs / get_one_reg ioctls
/// can fail mid-shutdown when KVM has begun tearing down the vCPU.
/// The caller stores `None` into the per-vCPU slot in that case;
/// the dump reflects "registers unavailable" rather than panicking
/// the freeze path.
#[cfg(target_arch = "x86_64")]
pub(crate) fn capture_vcpu_regs(vcpu: &mut kvm_ioctls::VcpuFd) -> Option<VcpuRegSnapshot> {
    let regs = vcpu.get_regs().ok()?;
    let sregs = vcpu.get_sregs().ok()?;
    Some(VcpuRegSnapshot {
        instruction_pointer: regs.rip,
        stack_pointer: regs.rsp,
        page_table_root: sregs.cr3,
        // x86_64 has a single CR3 covering both halves of the
        // address space; no separate userspace pgd to capture.
        user_page_table_root: None,
        // TCR_EL1 is an aarch64 register; not present on x86_64.
        tcr_el1: None,
    })
}

#[cfg(target_arch = "aarch64")]
pub(crate) fn capture_vcpu_regs(vcpu: &mut kvm_ioctls::VcpuFd) -> Option<VcpuRegSnapshot> {
    // ARM core register IDs encode
    // `(offsetof(struct kvm_regs, field) / sizeof(u32))` in the low
    // bits, OR'd with KVM_REG_ARM64 + KVM_REG_SIZE_U64 +
    // KVM_REG_ARM_CORE (per kernel uapi/asm/kvm.h
    // `KVM_REG_ARM_CORE_REG` macro). The offset is into
    // `struct kvm_regs`, NOT directly into `struct user_pt_regs`;
    // the two coincide for the first 272 bytes because
    // `kvm_regs.regs` is at offset 0, but adding fields past
    // `user_pt_regs` (e.g. `sp_el1` below) requires the
    // `kvm_regs`-relative encoding.
    //
    // struct kvm_regs (arch/arm64/include/uapi/asm/kvm.h):
    //   struct user_pt_regs regs;     // offset 0..272
    //     u64 regs[31];               //   offset   0..248
    //     u64 sp;       (= sp_el0)    //   offset 248
    //     u64 pc;                     //   offset 256
    //     u64 pstate;                 //   offset 264
    //   u64 sp_el1;                   // offset 272
    //   u64 elr_el1;                  // offset 280
    //   u64 spsr[KVM_NR_SPSR];        // offset 288..
    //   ...
    //
    // The kernel-side stack pointer is `sp_el1`, NOT `regs.sp`
    // (which is `sp_el0` — the userspace stack pointer per the
    // comment at arch/arm64/include/uapi/asm/kvm.h:47). sched_ext
    // exits fire in EL1 (kernel context), so capturing sp_el1
    // yields the kernel stack frame an operator can unwind
    // against the BPF map dump. Capturing sp_el0 here would
    // leak the userspace stack of whatever task happened to be
    // current — typically irrelevant for kernel-side debugging
    // and confusing in the JSON output.
    //
    // Each u32 step is +1 in the encoded ID.
    const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
    const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
    const KVM_REG_ARM_CORE: u64 = 0x0010_0000;
    // SP_EL1 lives at offset 272 in struct kvm_regs (right after
    // the 272-byte user_pt_regs). 272 / 4 = 68.
    const SP_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (272 / 4);
    // PC at offset 256 in user_pt_regs (= same offset in kvm_regs
    // because user_pt_regs.regs is at offset 0). 256 / 4 = 64.
    const PC_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (256 / 4);
    // ARM64 system registers encoded under KVM_REG_ARM64_SYSREG.
    // The 16-bit packing is
    //   (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2
    // per arch/arm64/include/uapi/asm/kvm.h `ARM64_SYS_REG` macro.
    const KVM_REG_ARM64_SYSREG: u64 = 0x0013_0000;
    // TTBR0_EL1: Op0=3, Op1=0, CRn=2, CRm=0, Op2=0
    // = (3 << 14) | (0 << 11) | (2 << 7) | (0 << 3) | 0 = 0xC100
    const TTBR0_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC100;
    // TTBR1_EL1: Op0=3, Op1=0, CRn=2, CRm=0, Op2=1
    // = (3 << 14) | (0 << 11) | (2 << 7) | (0 << 3) | 1 = 0xC101
    const TTBR1_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC101;
    // TCR_EL1: Op0=3, Op1=0, CRn=2, CRm=0, Op2=2
    // = (3 << 14) | (0 << 11) | (2 << 7) | (0 << 3) | 2 = 0xC102
    const TCR_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC102;

    let mut buf = [0u8; 8];
    let pc = vcpu
        .get_one_reg(PC_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf))?;
    let sp = vcpu
        .get_one_reg(SP_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf))?;
    // TTBR1 read is best-effort; some kernels gate sysreg access.
    // A failure leaves page_table_root = 0 — the boot-time
    // GuestKernel::cr3_pa is still available to the dump.
    let ttbr1 = vcpu
        .get_one_reg(TTBR1_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf))
        .unwrap_or(0);
    // TTBR0 read is best-effort. Stored in user_page_table_root
    // so a failure surfaces as None — distinct from a successful
    // read of 0, which means "no userspace mapping active at
    // freeze time" (e.g. the vCPU was running pure kernel code).
    let ttbr0 = vcpu
        .get_one_reg(TTBR0_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf));
    // TCR_EL1 carries the granule (TG1[31:30]) and high-half VA
    // size (T1SZ[21:16]) the page-table walker needs. Best-effort:
    // a failure leaves None and the walker falls back to the
    // boot-time cached value the freeze coordinator latched at
    // GuestKernel construction.
    let tcr_el1 = vcpu
        .get_one_reg(TCR_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf));
    Some(VcpuRegSnapshot {
        instruction_pointer: pc,
        stack_pointer: sp,
        page_table_root: ttbr1,
        user_page_table_root: ttbr0,
        tcr_el1,
    })
}
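
// Illustrative sanity check (not part of the original source): the
// register IDs above are hand-rolled, so recompute them here from the
// documented formulas and pin them against the well-known encoded
// values. A transcription slip in an offset or sysreg packing shows
// up as a test failure instead of a silent wrong-register read.
#[cfg(test)]
mod reg_id_encoding_tests {
    // Mirrors of the function-local constants in `capture_vcpu_regs`.
    const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
    const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
    const KVM_REG_ARM_CORE: u64 = 0x0010_0000;
    const KVM_REG_ARM64_SYSREG: u64 = 0x0013_0000;

    #[test]
    fn core_and_sysreg_ids_match_known_encodings() {
        // PC at kvm_regs offset 256 → u32 index 64; SP_EL1 at 272 → 68.
        let pc = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (256 / 4);
        let sp_el1 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (272 / 4);
        assert_eq!(pc, 0x6030_0000_0010_0040);
        assert_eq!(sp_el1, 0x6030_0000_0010_0044);
        // TTBR1_EL1 = (Op0=3, Op1=0, CRn=2, CRm=0, Op2=1) → 0xC101.
        let ttbr1 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC101;
        assert_eq!(ttbr1, 0x6030_0000_0013_c101);
    }
}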

/// Read TCR_EL1 directly from a vCPU. Used at GuestKernel
/// construction time to feed the page-table walker its granule and
/// VA-width settings (TG1 in bits [31:30], T1SZ in bits [21:16]).
///
/// Returns `None` on x86_64 (the register does not exist) and on
/// aarch64 if `KVM_GET_ONE_REG` fails. The caller treats `None` as
/// "no walker context yet"; on aarch64 that surfaces as a 0 stored
/// in the GuestKernel's `tcr_el1` field — the walker rejects T1SZ=0
/// and the affected lookups skip cleanly.
#[cfg(target_arch = "x86_64")]
pub(crate) fn read_tcr_el1(_vcpu: &mut kvm_ioctls::VcpuFd) -> Option<u64> {
    None
}

#[cfg(target_arch = "aarch64")]
pub(crate) fn read_tcr_el1(vcpu: &mut kvm_ioctls::VcpuFd) -> Option<u64> {
    // Same encoding constants as `capture_vcpu_regs`. TCR_EL1 packs
    // to (Op0=3, Op1=0, CRn=2, CRm=0, Op2=2) under the
    // KVM_REG_ARM64_SYSREG namespace.
    const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
    const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
    const KVM_REG_ARM64_SYSREG: u64 = 0x0013_0000;
    const TCR_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC102;
    let mut buf = [0u8; 8];
    vcpu.get_one_reg(TCR_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf))
}
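
// Worked field extraction (illustrative, with a hypothetical TCR_EL1
// value): TG1 lives at bits [31:30] and T1SZ at bits [21:16], per the
// doc comment above. A typical Linux configuration uses a 4 KB
// high-half granule (TG1 = 0b10) and a 48-bit kernel VA width
// (T1SZ = 16, since width = 64 - T1SZ).
#[cfg(test)]
mod tcr_field_tests {
    #[test]
    fn tg1_and_t1sz_extract_as_documented() {
        let tcr: u64 = (0b10 << 30) | (16 << 16);
        let tg1 = (tcr >> 30) & 0b11;
        let t1sz = (tcr >> 16) & 0x3F;
        assert_eq!(tg1, 0b10); // 4 KB granule for TTBR1 walks
        assert_eq!(64 - t1sz, 48); // 48-bit high-half VA width
    }
}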

/// Read the BSP's kernel-half page-table root directly from a vCPU.
/// On x86_64 this is `CR3` from `KVM_GET_SREGS`; on aarch64 it is
/// `TTBR1_EL1` from `KVM_GET_ONE_REG`.
///
/// Used by the BSP loop's lazy-CAS to populate the
/// [`super::freeze_coord::run_bsp_loop`] cr3 cache once the guest
/// kernel has installed its post-randomization page tables. The
/// monitor and BPF map writer threads consume the cached value to
/// resolve `phys_base` via a page-table walk that breaks the
/// chicken-and-egg with text-symbol PA translation.
///
/// Returns `None` on KVM_GET_SREGS / KVM_GET_ONE_REG failure
/// (transient EINTR mid-shutdown); callers retry on the next
/// iteration. A successful return of `0` means the kernel has not
/// yet installed its page tables (very-early boot before
/// `__startup_64` / `__cpu_setup`); callers MUST gate the CAS on a
/// non-zero value so a stale `0` does not displace a previously
/// latched non-zero CR3.
#[cfg(target_arch = "x86_64")]
pub(crate) fn read_cr3(vcpu: &mut kvm_ioctls::VcpuFd) -> Option<u64> {
    vcpu.get_sregs().ok().map(|s| s.cr3)
}

#[cfg(target_arch = "aarch64")]
pub(crate) fn read_cr3(vcpu: &mut kvm_ioctls::VcpuFd) -> Option<u64> {
    // TTBR1_EL1 holds the kernel-half page-table base (matches the
    // `page_table_root` field in `VcpuRegSnapshot`). Same encoding
    // as `capture_vcpu_regs`: (Op0=3, Op1=0, CRn=2, CRm=0, Op2=1)
    // under the KVM_REG_ARM64_SYSREG namespace.
    const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
    const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
    const KVM_REG_ARM64_SYSREG: u64 = 0x0013_0000;
    const TTBR1_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC101;
    let mut buf = [0u8; 8];
    vcpu.get_one_reg(TTBR1_EL1_ID, &mut buf)
        .ok()
        .map(|_| u64::from_le_bytes(buf))
}
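
// Illustrative test (hypothetical helper, not the real `run_bsp_loop`
// code) of the non-zero gate the doc comment above mandates: a stale
// `0` from very-early boot must never displace a previously latched
// CR3, and only the first non-zero value wins the CAS.
#[cfg(test)]
mod cr3_latch_gate_tests {
    use std::sync::atomic::{AtomicU64, Ordering};

    // Sketch of the caller-side latch: CAS from 0 only on a non-zero
    // fresh read.
    fn latch_nonzero(cache: &AtomicU64, fresh: u64) {
        if fresh != 0 {
            let _ = cache.compare_exchange(0, fresh, Ordering::AcqRel, Ordering::Acquire);
        }
    }

    #[test]
    fn stale_zero_never_displaces_latched_root() {
        let cache = AtomicU64::new(0);
        latch_nonzero(&cache, 0); // pre-page-table boot read
        assert_eq!(cache.load(Ordering::Acquire), 0);
        latch_nonzero(&cache, 0x0413_a000); // first real root
        latch_nonzero(&cache, 0); // stale zero: must not clear it
        latch_nonzero(&cache, 0xdead_b000); // later value: must not replace it
        assert_eq!(cache.load(Ordering::Acquire), 0x0413_a000);
    }
}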

impl std::fmt::Display for VcpuRegSnapshot {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "ip=0x{:016x} sp=0x{:016x} ptroot=0x{:016x}",
            self.instruction_pointer, self.stack_pointer, self.page_table_root
        )?;
        // user_page_table_root is x86_64-None / aarch64-Optional;
        // when present, render it inline so an aarch64 failure
        // dump shows both halves of the address space.
        if let Some(uptr) = self.user_page_table_root {
            write!(f, " uptroot=0x{uptr:016x}")?;
        }
        Ok(())
    }
}
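
// Display-format check (illustrative, hypothetical register values):
// `uptroot` renders only when the optional userspace root is present,
// matching the x86_64-`None` / aarch64-`Some` split documented on the
// struct.
#[cfg(test)]
mod snapshot_display_tests {
    use super::VcpuRegSnapshot;

    #[test]
    fn uptroot_renders_only_when_present() {
        let mut snap = VcpuRegSnapshot {
            instruction_pointer: 0xffff_8000_1234_5678,
            stack_pointer: 0xffff_8000_0000_f000,
            page_table_root: 0x4_1000,
            user_page_table_root: None,
            tcr_el1: None,
        };
        assert_eq!(
            snap.to_string(),
            "ip=0xffff800012345678 sp=0xffff80000000f000 ptroot=0x0000000000041000"
        );
        snap.user_page_table_root = Some(0xbeef_0000);
        assert!(snap.to_string().ends_with(" uptroot=0x00000000beef0000"));
    }
}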

// ---------------------------------------------------------------------------
// aarch64 MMIO dispatch — serial and virtio over MMIO
// ---------------------------------------------------------------------------

/// Dispatch an MMIO write to serial and virtio devices.
///
/// aarch64 reboot is signalled via PSCI (`VcpuExit::SystemEvent`), not
/// MMIO, so there is no shutdown return distinct from the normal write
/// path: this function always handles the write and returns to the
/// run loop.
#[cfg(target_arch = "aarch64")]
#[allow(clippy::too_many_arguments)]
pub(crate) fn dispatch_mmio_write(
    com1: &PiMutex<console::Serial>,
    com2: &PiMutex<console::Serial>,
    virtio_con: Option<&PiMutex<virtio_console::VirtioConsole>>,
    virtio_blk: Option<&PiMutex<virtio_blk::VirtioBlk>>,
    virtio_net: Option<&PiMutex<virtio_net::VirtioNet>>,
    addr: u64,
    data: &[u8],
) {
    if let Some(offset) = mmio_serial_offset(addr, kvm::SERIAL_MMIO_BASE)
        && let Some(&byte) = data.first()
    {
        com1.lock().inner_write(offset, byte);
    } else if let Some(offset) = mmio_serial_offset(addr, kvm::SERIAL2_MMIO_BASE)
        && let Some(&byte) = data.first()
    {
        com2.lock().inner_write(offset, byte);
    } else if let Some(vc) = virtio_con
        && (kvm::VIRTIO_CONSOLE_MMIO_BASE
            ..kvm::VIRTIO_CONSOLE_MMIO_BASE + virtio_console::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vc.lock()
            .mmio_write(addr - kvm::VIRTIO_CONSOLE_MMIO_BASE, data);
    } else if let Some(vb) = virtio_blk
        && (kvm::VIRTIO_BLK_MMIO_BASE..kvm::VIRTIO_BLK_MMIO_BASE + virtio_blk::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vb.lock().mmio_write(addr - kvm::VIRTIO_BLK_MMIO_BASE, data);
    } else if let Some(vn) = virtio_net
        && (kvm::VIRTIO_NET_MMIO_BASE..kvm::VIRTIO_NET_MMIO_BASE + virtio_net::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vn.lock().mmio_write(addr - kvm::VIRTIO_NET_MMIO_BASE, data);
    }
}

/// Dispatch an MMIO read from serial and virtio devices (console,
/// blk, net). Addresses that match no device read back as all-ones
/// (0xFF fill), the conventional open-bus response.
#[cfg(target_arch = "aarch64")]
#[allow(clippy::too_many_arguments)]
pub(crate) fn dispatch_mmio_read(
    com1: &PiMutex<console::Serial>,
    com2: &PiMutex<console::Serial>,
    virtio_con: Option<&PiMutex<virtio_console::VirtioConsole>>,
    virtio_blk: Option<&PiMutex<virtio_blk::VirtioBlk>>,
    virtio_net: Option<&PiMutex<virtio_net::VirtioNet>>,
    addr: u64,
    data: &mut [u8],
) {
    if let Some(offset) = mmio_serial_offset(addr, kvm::SERIAL_MMIO_BASE) {
        if let Some(first) = data.first_mut() {
            *first = com1.lock().inner_read(offset);
        }
    } else if let Some(offset) = mmio_serial_offset(addr, kvm::SERIAL2_MMIO_BASE) {
        if let Some(first) = data.first_mut() {
            *first = com2.lock().inner_read(offset);
        }
    } else if let Some(vc) = virtio_con
        && (kvm::VIRTIO_CONSOLE_MMIO_BASE
            ..kvm::VIRTIO_CONSOLE_MMIO_BASE + virtio_console::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vc.lock()
            .mmio_read(addr - kvm::VIRTIO_CONSOLE_MMIO_BASE, data);
    } else if let Some(vb) = virtio_blk
        && (kvm::VIRTIO_BLK_MMIO_BASE..kvm::VIRTIO_BLK_MMIO_BASE + virtio_blk::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vb.lock().mmio_read(addr - kvm::VIRTIO_BLK_MMIO_BASE, data);
    } else if let Some(vn) = virtio_net
        && (kvm::VIRTIO_NET_MMIO_BASE..kvm::VIRTIO_NET_MMIO_BASE + virtio_net::VIRTIO_MMIO_SIZE)
            .contains(&addr)
    {
        vn.lock().mmio_read(addr - kvm::VIRTIO_NET_MMIO_BASE, data);
    } else {
        for b in data.iter_mut() {
            *b = 0xff;
        }
    }
}

/// Compute register offset for an MMIO address within a serial region.
///
/// The serial MMIO region is page-sized (`kvm::SERIAL_MMIO_SIZE` =
/// 0x1000) so each UART sits on its own guest page, but the
/// underlying ns16550a register window is only 8 bytes wide
/// (DATA/IER/IIR/LCR/MCR/LSR/MSR/SCR at offsets 0..=7 per the
/// `vm-superio` `Serial::{read,write}` switch arms; out-of-range
/// offsets fall through to `_ => 0` / `_ => {}`). The legal-input
/// bound here is therefore the `u8` representable range, not the
/// page size: an offset above 0xFF cannot fit in `u8` and the cast
/// would silently wrap modulo 256 — a guest write to register 0x100
/// would land at register 0x00 (DATA), corrupting the TX path.
///
/// Tightening the upper bound to `base + 256` returns `None` for
/// the [0x100, SERIAL_MMIO_SIZE) sub-region, which falls through
/// to the next dispatch arm (and ultimately to the unmapped-MMIO
/// 0xFF fill on read / silent drop on write — the correct "no
/// device here" semantics for the unused tail of the page). The
/// kernel's 8250 driver only emits accesses to offsets 0..=7, so
/// no production guest hits the >= 0x100 region; this gate is a
/// hostile-guest defense against truncation-induced register
/// aliasing.
#[cfg(target_arch = "aarch64")]
fn mmio_serial_offset(addr: u64, base: u64) -> Option<u8> {
    // Bound is `u8::MAX as u64 + 1` (= 256) so the `as u8` cast
    // is total within the kept range. The page-sized
    // `SERIAL_MMIO_SIZE` is intentionally NOT used as the bound;
    // see the rationale on this function.
    const MAX_REG_OFFSET: u64 = u8::MAX as u64 + 1;
    // Compile-time guarantee: the region we accept fits inside the
    // declared MMIO window. If a future change shrinks
    // `SERIAL_MMIO_SIZE` below 256, this constant breaks the
    // build instead of letting the function admit offsets that
    // step into a neighbouring device's region.
    const _: () = assert!(
        kvm::SERIAL_MMIO_SIZE >= MAX_REG_OFFSET,
        "SERIAL_MMIO_SIZE must cover at least the 256-byte u8-representable \
         register window mmio_serial_offset accepts"
    );
    if addr >= base && addr < base + MAX_REG_OFFSET {
        Some((addr - base) as u8)
    } else {
        None
    }
}
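
// Boundary checks (illustrative) for the truncation defense described
// above: an offset of exactly 0x100 would wrap to register 0x00 under
// a bare `as u8` cast, so it must fall out of the accepted window
// instead. The base address here is hypothetical.
#[cfg(all(test, target_arch = "aarch64"))]
mod serial_offset_tests {
    use super::mmio_serial_offset;

    #[test]
    fn out_of_range_offsets_do_not_alias_registers() {
        let base = 0x0900_0000u64;
        assert_eq!(mmio_serial_offset(base, base), Some(0)); // DATA
        assert_eq!(mmio_serial_offset(base + 7, base), Some(7)); // SCR
        assert_eq!(mmio_serial_offset(base + 0xFF, base), Some(0xFF));
        assert_eq!(mmio_serial_offset(base + 0x100, base), None); // would wrap to DATA
        assert_eq!(mmio_serial_offset(base - 1, base), None); // below the window
    }
}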

// -- watchpoint hit dispatch ------------------------------------------
//
// Shared between the AP (`vcpu_run_loop_unified`) and BSP
// (`run_bsp_loop`) paths. Identifies which watchpoint slot fired
// from the per-arch `kvm_debug_exit_arch` payload, gates the slot-0
// trigger on the post-store `exit_kind` value (so a clean
// SCX_EXIT_DONE does not generate a failure dump), and latches the
// matched user slot for the freeze coordinator's epoll loop.

/// `ESR_ELx_EC` decoded value for a watchpoint exception taken from a
/// lower exception level (the only EC that surfaces guest-side data
/// watchpoint hits to userspace via `KVM_EXIT_DEBUG`). Pinned per
/// `arch/arm64/include/asm/esr.h` `ESR_ELx_EC_WATCHPT_LOW = 0x34`.
///
/// `WATCHPT_CUR` (EC=0x35, watchpoint taken at the current EL) is
/// not handled because the kernel's `arm_exit_handlers` table in
/// `arch/arm64/kvm/handle_exit.c` (which routes guest exceptions to
/// userspace via `KVM_EXIT_DEBUG`) only registers a handler for
/// `ESR_ELx_EC_WATCHPT_LOW` — `WATCHPT_CUR` has no entry and is
/// therefore never surfaced to userspace. The guest runs at EL0/EL1
/// and KVM hosts at EL2; from KVM's perspective every guest-side
/// watchpoint trap is "from a lower EL" (LOW), and only EL2's own
/// debug traps would be CUR (which KVM does not arm).
#[cfg(target_arch = "aarch64")]
const ESR_ELX_EC_WATCHPT_LOW: u32 = 0x34;
/// `ESR_ELx_EC` decoded value for a software-step exception taken
/// from a lower exception level. KVM raises this through
/// `KVM_EXIT_DEBUG` after a `KVM_GUESTDBG_SINGLESTEP`-armed
/// `KVM_RUN` retires exactly one instruction (kernel
/// `arch/arm64/kvm/handle_exit.c::kvm_handle_guest_debug` switches
/// on `ESR_ELx_EC_SOFTSTP_LOW` and toggles `DBG_SPSR_SS`). We use
/// it to detect "the offending store has retired" after stepping
/// past a watchpoint trap, so the next `self_arm_watchpoint` call
/// can restore the slot's WCR.E=1. Pinned per
/// `arch/arm64/include/asm/esr.h` `ESR_ELx_EC_SOFTSTP_LOW = 0x32`.
#[cfg(target_arch = "aarch64")]
const ESR_ELX_EC_SOFTSTP_LOW: u32 = 0x32;
/// Bit shift of the `ESR_ELx_EC` field within the lower 32 bits of
/// the ESR_EL2 value KVM hands userspace as
/// `kvm_debug_exit_arch.hsr`. Pinned per
/// `arch/arm64/include/asm/esr.h` `ESR_ELx_EC_SHIFT = 26`.
#[cfg(target_arch = "aarch64")]
const ESR_ELX_EC_SHIFT: u32 = 26;
/// Mask applied to `(hsr >> ESR_ELx_EC_SHIFT)` to extract the
/// 6-bit EC field. Pinned per
/// `ESR_ELx_EC(esr) = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT`
/// in the same kernel header.
#[cfg(target_arch = "aarch64")]
const ESR_ELX_EC_MASK: u32 = 0x3F;
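
// Worked EC decode (illustrative) matching the extraction
// `dispatch_watchpoint_hit` performs on `kvm_debug_exit_arch.hsr`:
// shift by 26, mask to 6 bits, compare against the pinned EC values.
// The hsr values here are hypothetical.
#[cfg(all(test, target_arch = "aarch64"))]
mod esr_decode_tests {
    use super::{
        ESR_ELX_EC_MASK, ESR_ELX_EC_SHIFT, ESR_ELX_EC_SOFTSTP_LOW, ESR_ELX_EC_WATCHPT_LOW,
    };

    #[test]
    fn ec_field_decodes_watchpoint_and_softstep() {
        // EC sits in bits [31:26]; arbitrary ISS bits below must not
        // leak into the decoded EC.
        let wp_hsr = (ESR_ELX_EC_WATCHPT_LOW << ESR_ELX_EC_SHIFT) | 0x22;
        assert_eq!((wp_hsr >> ESR_ELX_EC_SHIFT) & ESR_ELX_EC_MASK, ESR_ELX_EC_WATCHPT_LOW);
        let ss_hsr = (ESR_ELX_EC_SOFTSTP_LOW << ESR_ELX_EC_SHIFT) | 0x3F;
        assert_eq!((ss_hsr >> ESR_ELX_EC_SHIFT) & ESR_ELX_EC_MASK, ESR_ELX_EC_SOFTSTP_LOW);
    }
}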

/// Dispatch a `KVM_EXIT_DEBUG` watchpoint trap to the matching slot's
/// latch. Reads the per-arch identifier (DR6 on x86_64; ESR EC + FAR
/// on aarch64), gates the slot-0 trigger on the post-store
/// `exit_kind` value, and writes the appropriate `hit` flag for the
/// freeze coordinator to observe.
///
/// `armed_slots` is the per-thread mirror of currently-armed KVAs
/// (one entry per slot) maintained by `self_arm_watchpoint`. On
/// aarch64 it carries the original (un-aligned) KVA so the
/// FAR range check covers the exact 4 bytes the watchpoint targets,
/// not the 8-byte block DBGWVR addresses. On x86_64 the entry is
/// also the requested KVA (DR0..DR3 hold full addresses) but is not
/// used by this helper — DR6 alone identifies which slot fired.
///
/// `single_step_pending` and `single_step_slot` are the per-vCPU
/// loop-local single-step bookkeeping the aarch64 path uses to step
/// past a fired watchpoint:
///
///   - On `EC = ESR_ELx_EC_WATCHPT_LOW (0x34)` with at least one
///     slot whose 4-byte FAR window contains the fault address,
///     after latching `hit` on every matched slot the helper sets
///     `*single_step_pending = true` and stores a 4-bit bitmap of
///     matched slot indices into `*single_step_slot` (bit i set
///     ⇒ slot i was matched). The next loop iteration's
///     `self_arm_watchpoint` call notices the flipped flag,
///     reissues KVM_SET_GUEST_DEBUG with WCR.E=0 on EVERY matched
///     slot (peer arms stay enabled), asserts
///     `KVM_GUESTDBG_SINGLESTEP`, and the following KVM_RUN
///     executes exactly one instruction past the offending store.
///     A 4-bit bitmap is sufficient because there are only four
///     hardware watchpoint slots; multiple matches happen when
///     `arm_user_watchpoint` placed two slots on overlapping
///     KVAs (no duplicate-rejection — see comment on the FAR
///     range loop).
///   - On `EC = ESR_ELx_EC_SOFTSTP_LOW (0x32)` with the flag set,
///     the helper clears `*single_step_pending` (without latching).
///     The next `self_arm_watchpoint` call restores WCR.E=1 on
///     every previously-disabled slot and drops the singlestep
///     bit. The mask in `*single_step_slot` is functionally not
///     consulted on this transition because the per-slot E
///     selector at `vcpu.rs::self_arm_watchpoint` short-circuits
///     on `single_step_pending == false`; restoring WCR.E
///     globally is correct because all slots have valid
///     `request_kva` published.
///
/// Both fields are inert on x86_64 — the x86 watchpoint trap is
/// taken AFTER the store retires (Intel SDM Vol. 3B 17.2.4
/// "Trap-class debug exceptions"), so re-entry advances normally
/// without the disable-step-rearm dance.
pub(crate) fn dispatch_watchpoint_hit(
    watchpoint: &WatchpointArm,
    debug_arch: &kvm_bindings::kvm_debug_exit_arch,
    armed_slots: &[u64; 4],
    single_step_pending: &mut bool,
    single_step_slot: &mut usize,
) {
    #[cfg(target_arch = "x86_64")]
    {
        // DR6 layout (Intel SDM Vol. 3B 17.2.5): bits 0-3 (B0..B3)
        // indicate which DR fired. Bit 14 (BS) signals single-step.
        // KVM populates `kvm_run.debug.arch.dr6` from the just-
        // fired exit's qualification field (`vmx_get_exit_qual`
        // in `arch/x86/kvm/vmx/vmx.c::handle_exception_nmi`),
        // not from the architectural DR6 register, so the bits we
        // see reflect ONLY the slots that fired on THIS exit —
        // not stale "sticky" bits from prior exits. The dedup
        // gate on `WatchpointArm::hit` (CAS in `latch_*`) handles
        // the cross-vCPU race where two vCPUs each fire on the
        // same slot before either has been processed by the
        // freeze coordinator: only the first false→true transition
        // wakes the coordinator's `hit_evt`.
        //
        // Single-step is aarch64-only — the x86 watchpoint trap
        // is taken AFTER the offending store retires (Intel SDM
        // Vol. 3B 17.2.4 "Trap-class debug exceptions"), so re-
        // entering KVM_RUN advances normally without the
        // disable-step-rearm dance. Consume the unused inputs to
        // keep the per-arch helper signature shared.
        let _ = armed_slots;
        let _ = (&mut *single_step_pending, &mut *single_step_slot);
        let dr6 = debug_arch.dr6;
        let trap_bits = (dr6 & 0xF) as u8;
        if trap_bits == 0 {
            // KVM exited via KVM_EXIT_DEBUG with no DR0..3 trap
            // bits set — possible when a single-step (BS, bit 14)
            // or task-switch (BT, bit 15) fired without a data/
            // code breakpoint match. ktstr never arms BS/BT, so
            // this is either a host-side debug stub leaking
            // through or a synthetic exit — log and ignore.
            // Mirrors the aarch64 "no FAR match" debug log so
            // both arches surface unexpected debug exits the
            // same way.
            tracing::debug!(
                dr6,
                "KVM_EXIT_DEBUG fired with no DR0..DR3 trap bit set \
                 (BS/BT or spurious); not latching"
            );
            return;
        }
        if trap_bits & 0x1 != 0 {
            latch_slot0_with_gate(watchpoint);
        }
        for idx in 0..3 {
            if trap_bits & (1u8 << (idx + 1)) != 0 {
                watchpoint.latch_user_hit(idx);
            }
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // ARM debug exit payload (kernel
        // `arch/arm64/kvm/handle_exit.c::kvm_handle_guest_debug`):
        //   `hsr` = lower 32 bits of ESR_EL2
        //   `far` = FAR_EL2 (set only when ESR.EC == WATCHPT_LOW)
        // The EC field at bits [31:26] of ESR distinguishes
        // watchpoint exceptions from breakpoints / soft-step / BRK.
        let ec = (debug_arch.hsr >> ESR_ELX_EC_SHIFT) & ESR_ELX_EC_MASK;
        if ec == ESR_ELX_EC_SOFTSTP_LOW {
            // Software-step exception following a watchpoint hit.
            // The kernel sets cpsr.SS in
            // `kvm_handle_guest_debug` to advertise that exactly
            // one instruction retired since the prior fire; we
            // clear `single_step_pending` so the next
            // `self_arm_watchpoint` call restores the slot's
            // WCR.E=1 and drops `KVM_GUESTDBG_SINGLESTEP`. Do NOT
            // latch any `hit` flag here — the original
            // WATCHPT_LOW exit already latched the freeze
            // trigger; this exit only signals "one instruction
            // executed cleanly past the watched store".
            //
            // If the flag is NOT set we got a soft-step exit
            // we did not request (e.g. host kernel quirk, peer
            // tooling); log and ignore — there is no slot to
            // restore.
            if *single_step_pending {
                *single_step_pending = false;
                // Zero `single_step_slot` defensively. The
                // per-slot E selector in
                // `vcpu.rs::self_arm_watchpoint` already
                // short-circuits on `single_step_pending ==
                // false`, so a stale mask cannot disable a slot
                // — but a future regression that drops the
                // short-circuit would silently disable whatever
                // slots the stale mask still flags. Zeroing
                // here makes the post-step state purely
                // reflect "no slots pending step" so downstream
                // readers cannot trip on a leftover bitmap.
                *single_step_slot = 0;
            } else {
                tracing::debug!(
                    hsr = debug_arch.hsr,
                    "KVM_EXIT_DEBUG soft-step EC with no \
                     single-step pending; ignoring (likely \
                     spurious kernel-side step exit)"
                );
            }
            return;
        }
        if ec != ESR_ELX_EC_WATCHPT_LOW {
            tracing::debug!(
                hsr = debug_arch.hsr,
                ec,
                "KVM_EXIT_DEBUG with non-watchpoint EC; ignoring \
                 (breakpoint/BRK paths are not used by ktstr)"
            );
            return;
        }
        let far = debug_arch.far;
        // ARM ARM D2.10.5: FAR may be imprecise for unaligned
        // accesses. This exact-range check is correct for atomic_t
        // writes (aligned 4-byte stores via atomic_set/cmpxchg) but
        // would miss imprecise hits from unaligned multi-byte
        // accesses spanning the watched range.
        //
        // Range-match FAR against each armed slot's 4-byte window.
        // `armed_slots[i]` is the requested KVA (unaligned); the
        // watch covers `[kva, kva + 4)`. Multiple slots may match
        // if their watched ranges overlap (e.g. two
        // `Op::WatchSnapshot` registrations on adjacent 4-byte
        // fields of the same struct word). `arm_user_watchpoint`
        // allocates by free-slot search and does NOT reject
        // duplicate KVAs, so overlapping arms are possible. The
        // loop iterates all four slots and latches every match —
        // overlapping arms are never silently dropped.
        let mut matched_mask: u8 = 0;
        for (i, kva) in armed_slots.iter().enumerate() {
            if *kva == 0 {
                continue;
            }
            if far >= *kva && far < kva.saturating_add(4) {
                matched_mask |= 1 << i;
                if i == 0 {
                    latch_slot0_with_gate(watchpoint);
                } else {
                    watchpoint.latch_user_hit(i - 1);
                }
            }
        }
        if matched_mask == 0 {
            tracing::debug!(
                hsr = debug_arch.hsr,
                far,
                armed = ?armed_slots,
                "KVM_EXIT_DEBUG watchpoint fired but FAR matched no \
                 armed slot (possible KVM watchpoint match-distance \
                 fallback or stale arm); not latching"
            );
            return;
        }
        // The aarch64 watchpoint trap fires BEFORE the offending
        // store retires (ARM ARM D2.10.5: "the exception is
        // taken on the instruction that would have made the
        // access"). Re-entering KVM_RUN without intervention
        // replays the same store and re-trips the watchpoint
        // forever. Mirror the kernel's
        // `arch/arm64/kernel/hw_breakpoint.c::do_watchpoint`
        // recipe with a two-mechanism dance:
        //
        //   - `KVM_GUESTDBG_SINGLESTEP` (which sets MDSCR_EL1.SS
        //     in the kernel's `setup_external_mdscr`) is what
        //     causes KVM to retire EXACTLY ONE guest instruction
        //     and exit with `EC = ESR_ELx_EC_SOFTSTP_LOW (0x32)`.
        //     This advances the PC past the watched store.
        //     MDSCR.SS does NOT suppress watchpoint exceptions on
        //     the stepped instruction (per ARM ARM D2.12,
        //     software-step state still respects WCR.E for
        //     watchpoints that match the stepped access).
        //
        //   - Clearing WCR.E=0 on every matched slot is what
        //     prevents the watched store from re-tripping the
        //     watchpoint on the single-step pass. Without this,
        //     the same instruction that originally trapped would
        //     re-trap on its replay (the trap is taken BEFORE
        //     the access; the access has not retired yet).
        //
        // `single_step_slot` carries a 4-bit mask of every
        // matched slot index (bit i set ⇒ slot i must have its
        // WCR.E cleared during the single-step pass). Multiple
        // bits can be set: `arm_user_watchpoint` allocates by
        // free-slot search and does NOT reject duplicate KVAs,
        // so two slots may watch overlapping ranges and fire
        // simultaneously on the same store. `self_arm_watchpoint`
        // walks the mask and clears WCR.E on every set bit; when
        // `single_step_pending` clears on the following
        // SOFTSTP_LOW exit, all slots get WCR.E=1 restored.
        *single_step_pending = true;
        *single_step_slot = matched_mask as usize;
    }
}
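
// Illustrative restatement (hypothetical helper, not the dispatch
// code itself) of the FAR range-match loop above, so the
// matched-slot bitmap semantics can be exercised without a live
// vCPU: overlapping 4-byte arms both set their bit for a FAR inside
// the intersection.
#[cfg(test)]
mod far_match_tests {
    fn matched_mask(armed_slots: &[u64; 4], far: u64) -> u8 {
        let mut mask = 0u8;
        for (i, kva) in armed_slots.iter().enumerate() {
            if *kva != 0 && far >= *kva && far < kva.saturating_add(4) {
                mask |= 1 << i;
            }
        }
        mask
    }

    #[test]
    fn overlapping_arms_both_match() {
        // Slot 0 watches [..1000, ..1004); slot 1 watches
        // [..1002, ..1006). A fault address of ..1002 lands in both
        // windows.
        let armed = [0xffff_0000_0000_1000, 0xffff_0000_0000_1002, 0, 0];
        assert_eq!(matched_mask(&armed, 0xffff_0000_0000_1002), 0b11);
        assert_eq!(matched_mask(&armed, 0xffff_0000_0000_1005), 0b10);
        assert_eq!(matched_mask(&armed, 0xffff_0000_0000_1006), 0);
    }
}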

/// Slot-0 latch with the post-store `exit_kind` value gate. Reads
/// the host pointer the freeze coordinator published, compares
/// against [`SCX_EXIT_ERROR_THRESHOLD`], and latches the failure-
/// trigger only on error-class transitions.
///
/// `kind_host_ptr` is guaranteed non-null when this helper runs.
/// The freeze coordinator publishes the pair in `kind_host_ptr →
/// request_kva` order with matching `Release` stores (see
/// `freeze_coord.rs::run_coord_loop`, where the err_exit publish
/// issues the `kind_host_ptr` store BEFORE the `request_kva`
/// store). [`super::vcpu::self_arm_watchpoint`] only programs the
/// hardware watchpoint after observing a non-zero `request_kva`
/// via an `Acquire` load — that load synchronises-with the
/// publisher's `Release`, which makes the prior `kind_host_ptr`
/// store visible too. Once armed, a fire reaches this helper only
/// when both stores are visible. The pointer is never invalidated
/// for the VM lifetime: `vm.guest_mem` (which backs the host
/// mapping) is dropped only after every vCPU thread has joined, so
/// the host-side mapping at this address strictly outlives every
/// reader of this pointer.
///
/// On aarch64 an `Acquire` fence pairs with the guest's store: by
/// the time KVM_RUN returns `KVM_EXIT_DEBUG` the trap-into-EL2 +
/// host-context-restore path has already issued an architectural
/// context-synchronization event (ERET, eret-to-EL1 from the
/// hypervisor save/restore), but Rust's memory model does not
/// know about those. The fence makes the `read_volatile` of the
/// host pointer ordered-after that synchronization in Rust's
/// happens-before graph, matching what the x86_64 path gets for
/// free from TSO.
///
/// A null observation here would be a publication-invariant
/// violation; we still check at runtime so a regression in the
/// publisher cannot be turned into a `read_volatile` of a null
/// pointer (UB). The check costs one branch on the cold debug
/// trap path — negligible — and surfaces the invariant break as
/// a `tracing::error!` instead of crashing the run.
fn latch_slot0_with_gate(watchpoint: &WatchpointArm) {
    let host_ptr = watchpoint.kind_host_ptr.load(Ordering::Acquire);
    if host_ptr.is_null() {
        tracing::error!(
            "latch_slot0_with_gate: kind_host_ptr null at fire time — \
             publication invariant broken (request_kva non-zero must \
             imply kind_host_ptr non-null per the Release-store \
             ordering in freeze_coord.rs::run_coord_loop). Skipping \
             slot-0 latch; the BPF .bss late-trigger fallback in the \
             freeze coordinator's poll loop remains active."
        );
        return;
    }
    // Publish ordering: the guest's store is globally visible by
    // the time KVM exits to userspace, but Rust's memory model
    // requires an explicit Acquire fence on weakly-ordered hosts
    // (aarch64) to make the host-pointer read happens-after the
    // guest store in the Rust abstract machine. On x86_64 TSO
    // gives us this for free; the explicit fence is a no-op
    // codegen-wise but keeps the operation ordered in std::sync
    // terms across both arches.
    std::sync::atomic::fence(Ordering::Acquire);
    // SAFETY: `kind_host_ptr` was published by the freeze
    // coordinator before `request_kva` (Release), and the
    // `request_kva` non-zero load that triggered the arm is the
    // synchronizes-with edge for this read. The pointer addresses
    // a u32 inside the guest's `scx_sched` slab page, which stays
    // mapped for the VM lifetime per the `ReservationGuard`
    // contract. Non-null per the `is_null` early-return above.
    let kind = unsafe { std::ptr::read_volatile(host_ptr) };
    if kind >= SCX_EXIT_ERROR_THRESHOLD {
        watchpoint.latch_hit();
    } else {
        tracing::debug!(
            kind,
            threshold = SCX_EXIT_ERROR_THRESHOLD,
            "watchpoint fired on non-error exit_kind transition \
             (e.g. SCX_EXIT_DONE on clean shutdown); skipping \
             freeze trigger"
        );
    }
}
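
// Illustrative restatement of the threshold comparison above (the
// real helper reads `kind` through the published host pointer; here
// it is passed by value so the gate can be tested without guest
// memory). Assumes the threshold is a u32 like the `kind` field it
// gates.
#[cfg(test)]
mod exit_kind_gate_tests {
    use super::SCX_EXIT_ERROR_THRESHOLD;

    fn is_error_class(kind: u32) -> bool {
        kind >= SCX_EXIT_ERROR_THRESHOLD
    }

    #[test]
    fn sub_threshold_kinds_do_not_trigger() {
        assert!(!is_error_class(0)); // no transition / clean-shutdown range
        assert!(!is_error_class(SCX_EXIT_ERROR_THRESHOLD - 1));
        assert!(is_error_class(SCX_EXIT_ERROR_THRESHOLD));
        assert!(is_error_class(SCX_EXIT_ERROR_THRESHOLD + 1));
    }
}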

/// Unified per-vCPU KVM_RUN loop for AP threads.
///
/// HLT on APs: check kill + continue on both arches (KVM delivers
/// interrupts to wake the vCPU). Shutdown sets the kill flag so all
/// other vCPUs exit.
///
/// `watchpoint` carries the failure-dump trigger contract: each
/// iteration polls `watchpoint.request_kva` and self-arms a hardware
/// data-write watchpoint on `*scx_root->exit_kind` once the freeze
/// coordinator has resolved its KVA. When the kernel later writes
/// the field, KVM exits via `VcpuExit::Debug`; this loop sets
/// `watchpoint.hit` so the freeze coordinator's late-trigger poll
/// fires immediately. The arm is one-shot per KVA value (the
/// per-vCPU `armed_kva` slot suppresses re-arms after the ioctl
/// lands).
///
/// Freeze handling: when the freeze flag is set, the vCPU thread
/// performs the Cloud Hypervisor pause/snapshot drain dance
/// (set_immediate_exit(1) → vcpu.run() → set_immediate_exit(0)) so
/// any in-flight PIO/MMIO operation completes inside the KVM_RUN
/// ioctl before the thread parks. The drain is necessary because
/// KVM_EXIT_IO/MMIO leave the operation only partially complete on
/// the kernel side; userspace must re-enter KVM_RUN to commit it.
/// After draining, the thread sets `parked=true` (Release-ordered so
/// the host's subsequent guest-memory reads happen-after the
/// drain), then polls freeze on park_timeout. The Acquire load on
/// `parked` from the freeze coordinator IS the memory barrier that
/// makes external-thread guest-memory reads correct on weakly
/// ordered architectures (matches Cloud Hypervisor's pause
/// pattern). The kick that triggers freeze observation uses
/// Firecracker's SIGRTMIN+immediate_exit pattern, but the drain
/// dance itself is Cloud Hypervisor-specific.
#[allow(clippy::too_many_arguments)]
pub(crate) fn vcpu_run_loop_unified(
    vcpu: &mut kvm_ioctls::VcpuFd,
    com1: &Arc<PiMutex<console::Serial>>,
    com2: &Arc<PiMutex<console::Serial>>,
    virtio_con: Option<&Arc<PiMutex<virtio_console::VirtioConsole>>>,
    virtio_blk: Option<&Arc<PiMutex<virtio_blk::VirtioBlk>>>,
    virtio_net: Option<&Arc<PiMutex<virtio_net::VirtioNet>>>,
    kill: &Arc<AtomicBool>,
    kill_evt: &Arc<EventFd>,
    freeze: &Arc<AtomicBool>,
    parked: &Arc<AtomicBool>,
    regs_slot: &Arc<std::sync::Mutex<Option<VcpuRegSnapshot>>>,
    watchpoint: &Arc<WatchpointArm>,
    has_immediate_exit: bool,
    parked_evt: Option<&Arc<EventFd>>,
    thaw_evt: Option<&Arc<EventFd>>,
) {
    // Per-AP `armed_slots` mirrors the BSP-side slot array in
    // `freeze_coord::run_bsp_loop`. Index 0 = DR0 (err_exit watchpoint
    // for `*scx_root->exit_kind`); indices 1..=3 = DR1/DR2/DR3 (user
    // `Op::WatchSnapshot` arms). All start at `0` until the freeze
    // coordinator publishes resolved KVAs. The array is a per-thread
    // local so the per-iteration arm check is four Acquire loads
    // with no cross-thread synchronization beyond the published
    // requests. `arm_failures` counts consecutive non-EINTR ioctl
    // failures; EINTR is transient (SIGRTMIN kick race) and does NOT
    // increment, so a kicked-mid-arm vCPU retries instead of
    // permanently disabling the watchpoint.
    let mut armed_slots: [u64; 4] = [0; 4];
    let mut arm_failures: u8 = 0;
    // aarch64 watchpoint single-step bookkeeping. On aarch64 the
    // hardware watchpoint trap is taken BEFORE the offending store
    // retires (ARM ARM D2.10.5: "the exception is taken on the
    // instruction that would have made the access"), so re-entering
    // KVM_RUN replays the same instruction and re-trips the
    // watchpoint forever. Mirroring the kernel's
    // `arch/arm64/kernel/hw_breakpoint.c` recipe, we disable the
    // fired slot's WCR.E and assert KVM_GUESTDBG_SINGLESTEP for
    // exactly one KVM_RUN; the next KVM_EXIT_DEBUG carries
    // EC=ESR_ELx_EC_SOFTSTP_LOW (0x32), at which point we clear
    // `single_step_pending` and `self_arm_watchpoint` reissues
    // KVM_SET_GUEST_DEBUG with WCR.E restored to 1 and
    // KVM_GUESTDBG_SINGLESTEP cleared. Inert on x86_64 (the trap
    // there is taken AFTER the store, so re-entry advances
    // normally); the locals still pass through to keep the
    // per-arch helper signatures shared.
    let mut single_step_pending: bool = false;
    let mut single_step_slot: usize = 0;
    let mut armed_single_step: bool = false;
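    // Illustrative transition sketch (summarizing the recipe above;
    // aarch64 only, not normative):
    //   armed (WCR.E=1) → watchpoint KVM_EXIT_DEBUG → single_step_pending
    //   single_step_pending → self_arm: WCR.E=0 + SINGLESTEP → one KVM_RUN
    //   KVM_EXIT_DEBUG (EC=SOFTSTP_LOW) → clear pending → self_arm
    //   restores WCR.E=1, drops SINGLESTEP → armed again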
    loop {
        if kill.load(Ordering::Acquire) {
            break;
        }
        // Honour a pending freeze before re-entering KVM_RUN.
        if freeze.load(Ordering::Acquire) {
            handle_freeze(
                vcpu,
                has_immediate_exit,
                kill,
                freeze,
                parked,
                regs_slot,
                parked_evt.map(|a| a.as_ref()),
                thaw_evt.map(|a| a.as_ref()),
                Some(kill_evt.as_ref()),
            );
            if kill.load(Ordering::Acquire) {
                break;
            }
        }
        // Self-arm the failure-dump watchpoint when the coordinator
        // publishes (or republishes) a request KVA. Cheap (atomic
        // load + compare against `armed_kva`) when no new arm is
        // pending. Mirrors `run_bsp_loop`'s arm-before-run pattern;
        // both paths share `WatchpointArm` so a fire on either
        // triggers the late-snapshot rendezvous. Also drives the
        // aarch64 watchpoint single-step transition: when
        // `single_step_pending` is set by the prior watchpoint
        // exit, this call reissues KVM_SET_GUEST_DEBUG with the
        // fired slot's WCR.E cleared and KVM_GUESTDBG_SINGLESTEP
        // asserted; when the SOFTSTP_LOW exit clears the flag, the
        // next call restores WCR.E=1 and drops the singlestep bit.
        self_arm_watchpoint(
            vcpu,
            watchpoint,
            &mut armed_slots,
            &mut arm_failures,
            single_step_pending,
            single_step_slot,
            &mut armed_single_step,
        );

        match vcpu.run() {
            Ok(mut exit) => {
                if matches!(exit, VcpuExit::Hlt) {
                    if kill.load(Ordering::Acquire) {
                        break;
                    }
                    continue;
                }
                // KVM_EXIT_DEBUG fires when the armed hardware
                // data-write watchpoint trips on a guest write to
                // `*scx_root->exit_kind`. The kernel writes the
                // field on BOTH error transitions
                // (`scx_error -> SCX_EXIT_ERROR/_BPF/_STALL >=
                // 1024`) AND clean shutdown
                // (`scx_unregister -> SCX_EXIT_DONE = 1`). Only the
                // error transitions should trigger the failure-dump
                // freeze; firing on every clean test exit is a
                // regression. Read the post-store value from the
                // host pointer the coordinator published and gate
                // `hit` on the error threshold. The watchpoint is
                // left armed regardless: the coordinator's freeze +
                // thaw is synchronous with the dump emission, and a
                // future error after a clean transition would still
                // fire (slab page lifetime — the scheduler's
                // `scx_sched` is not freed until well after the
                // last `exit_kind` write).
                if let VcpuExit::Debug(debug_arch) = &exit {
                    dispatch_watchpoint_hit(
                        watchpoint,
                        debug_arch,
                        &armed_slots,
                        &mut single_step_pending,
                        &mut single_step_slot,
                    );
                    if kill.load(Ordering::Acquire) {
                        break;
                    }
                    continue;
                }
                match classify_exit(
                    com1,
                    com2,
                    virtio_con.map(|a| a.as_ref()),
                    virtio_blk.map(|a| a.as_ref()),
                    virtio_net.map(|a| a.as_ref()),
                    &mut exit,
                ) {
                    Some(ExitAction::Continue) | None => {}
                    Some(ExitAction::Shutdown) => {
                        kill.store(true, Ordering::Release);
                        // Wake the freeze coordinator's epoll loop
                        // so it sees the kill flag without waiting
                        // up to one full epoll timeout. Failure
                        // (EAGAIN under EFD_NONBLOCK from a
                        // saturated counter) is benign — any prior
                        // pending edge already wakes the coord, and
                        // the AtomicBool above remains the source
                        // of truth.
                        let _ = kill_evt.write(1);
                        break;
                    }
                    Some(ExitAction::Fatal(_)) => {
                        // AP fatal exit (FailEntry / InternalError):
                        // surface in tracing AND propagate the kill
                        // signal. Without `kill.store(true)` and the
                        // kill_evt write, the AP thread silently
                        // exits while peer vCPUs and the freeze
                        // coordinator stay running — peers eventually
                        // hit FREEZE_RENDEZVOUS_TIMEOUT instead of
                        // shutting down promptly. Mirrors the
                        // Shutdown arm's kill-propagation pattern.
                        tracing::error!("AP fatal exit");
                        kill.store(true, Ordering::Release);
                        let _ = kill_evt.write(1);
                        break;
                    }
                }
            }
            Err(e) => {
                if e.errno() == libc::EINTR || e.errno() == libc::EAGAIN {
                    vcpu.set_kvm_immediate_exit(0);
                    if kill.load(Ordering::Acquire) {
                        break;
                    }
                    continue;
                }
                // Non-EINTR/EAGAIN KVM_RUN failure: log before
                // retrying so a persistent error (e.g. EFAULT) is
                // not silently spun on; shutdown via the kill flag
                // remains the only exit from the loop.
                tracing::error!(err = %e, "AP KVM_RUN failed");
                if kill.load(Ordering::Acquire) {
                    break;
                }
            }
        }

        if kill.load(Ordering::Acquire) {
            break;
        }
    }
}

/// Drain pending PIO/MMIO state and park the vCPU until freeze
/// clears. Called from the run loop when the freeze flag is observed,
/// and from `mod.rs::run_bsp_loop` for the same purpose on the BSP
/// thread.
///
/// The drain dance — `set_immediate_exit(1) → vcpu.run() →
/// set_immediate_exit(0)` — is the Cloud Hypervisor pause/snapshot
/// pattern for completing in-flight I/O before pausing. KVM_RUN with
/// immediate_exit=1 returns -EINTR without entering the guest but
/// still commits any pending PIO/MMIO state from the previous exit
/// (per the KVM API contract: pending I/O is committed at the start
/// of KVM_RUN even when immediate_exit prevents guest entry).
/// `has_immediate_exit` gates the dance — without
/// KVM_CAP_IMMEDIATE_EXIT, calling `vcpu.run()` here would re-enter
/// the guest instead of returning EINTR, so the drain step is
/// skipped on kernels that lack the cap. The freeze rendezvous
/// itself still works (set parked, await thaw); only the I/O drain
/// is skipped.
///
/// After the drain, the thread sets `parked=true` with Release
/// ordering and waits until the coordinator clears `freeze`. When a
/// `thaw_evt` is plumbed, the wait polls [thaw_evt, kill_evt] with a
/// 100 ms backstop; the coordinator writes `thaw_evt` alongside
/// `freeze.store(false, Release)`, so thaw latency is normally one
/// poll wake and the backstop bounds a missed eventfd edge. Without
/// a `thaw_evt`, the legacy `park_timeout(10ms)` cadence applies and
/// picks up the cleared flag within at most 10 ms — well below the
/// dump latency budget either way.
///
/// `kill` is honoured throughout: a shutdown signal during the park
/// loop wins over freeze and the function returns to the caller's
/// kill-check at the top of the loop.
#[allow(clippy::too_many_arguments)]
pub(crate) fn handle_freeze(
    vcpu: &mut kvm_ioctls::VcpuFd,
    has_immediate_exit: bool,
    kill: &Arc<AtomicBool>,
    freeze: &Arc<AtomicBool>,
    parked: &Arc<AtomicBool>,
    regs_slot: &Arc<std::sync::Mutex<Option<VcpuRegSnapshot>>>,
    parked_evt: Option<&EventFd>,
    thaw_evt: Option<&EventFd>,
    kill_evt: Option<&EventFd>,
) {
    // Drain dance: complete any pending PIO/MMIO before parking.
    // Skipped on kernels without KVM_CAP_IMMEDIATE_EXIT, where
    // calling vcpu.run() with the cap absent would re-enter the
    // guest instead of returning EINTR.
    if has_immediate_exit {
        vcpu.set_kvm_immediate_exit(1);
        // Drain dance: KVM_RUN with immediate_exit=1 commits any
        // pending PIO/MMIO from the prior exit and returns EINTR
        // without entering the guest (per the KVM API contract). EINTR
        // is the expected outcome; any other error means KVM rejected
        // the ioctl (e.g. KVM_RUN unsupported state, vCPU corruption)
        // and the freeze coordinator's subsequent guest-memory reads
        // may observe partial state. Log non-EINTR explicitly so a
        // real KVM regression is not silently swallowed.
        if let Err(e) = vcpu.run()
            && e.errno() != libc::EINTR
        {
            tracing::warn!(
                err = %e,
                "handle_freeze: drain KVM_RUN failed with non-EINTR — \
                 pending PIO/MMIO may not have committed before park"
            );
        }
        vcpu.set_kvm_immediate_exit(0);
    }

    // Capture vCPU registers BEFORE the Release store on `parked`.
    // KVM_GET_REGS / KVM_GET_SREGS are vCPU-fd-bound ioctls — they
    // must run on the vCPU thread (not cross-thread from the
    // coordinator). Capturing here means the regs slot's Mutex
    // store is happens-before the coordinator's Acquire on
    // `parked`, so the coordinator can read the slot via the same
    // synchronizes-with edge that makes its guest-memory reads
    // correct. A failed capture stores `None`; the dump shows
    // "registers unavailable" rather than panicking the freeze.
    let snapshot = capture_vcpu_regs(vcpu);
    *regs_slot.lock().unwrap_or_else(|e| e.into_inner()) = snapshot;

    // Acknowledge frozen state. The Release store synchronizes-with
    // the coordinator's Acquire load on `parked`, providing the
    // happens-before edge that makes the coordinator's subsequent
    // guest-memory reads correct.
    parked.store(true, Ordering::Release);

    // Wake the freeze coordinator's rendezvous wait — write to the
    // shared `parked_evt` AFTER the Release store on `parked`. The
    // coordinator drains the eventfd once and then re-checks every
    // vCPU's `parked` flag plus the worker's `paused` flag. The
    // ordering is load-bearing: the coordinator's Acquire load on
    // `parked` happens-after this Release, so its subsequent
    // guest-memory reads observe every queue mutation the vCPU
    // performed before the drain dance.
    //
    // EAGAIN under EFD_NONBLOCK from a saturated counter is benign:
    // any prior pending edge already wakes the coordinator. Other
    // errnos (EBADF, EINVAL) signal real eventfd breakage and
    // warrant a higher-severity log than the benign-saturation case.
    // Either way we do not propagate — the parked AtomicBool is the
    // source of truth for the freeze rendezvous, and the
    // coordinator's epoll-wait backstop bounds wake latency without
    // the eventfd edge.
    if let Some(evt) = parked_evt
        && let Err(e) = evt.write(1)
    {
        if e.raw_os_error() == Some(libc::EAGAIN) {
            tracing::debug!(
                err = %e,
                "handle_freeze: parked_evt write returned EAGAIN \
                 (eventfd counter saturated; benign — coordinator \
                 already has a pending wake edge)"
            );
        } else {
            tracing::warn!(
                err = %e,
                "handle_freeze: parked_evt write failed with non-EAGAIN \
                 errno — eventfd may be broken; freeze coordinator wake \
                 falls back to epoll backstop"
            );
        }
    }

    // Park until freeze clears or shutdown wins. The thaw_evt
    // is written by the freeze coordinator alongside
    // `freeze.store(false, Release)`; poll on [thaw_evt, kill_evt]
    // with a 100 ms backstop so a missed eventfd write (counter
    // overflow / EAGAIN) still drops the parked vCPU within
    // bounded latency. Without the thaw_evt the legacy
    // park_timeout(10 ms) cadence applies as the only source of
    // wake.
    use std::os::fd::AsRawFd;
    while freeze.load(Ordering::Acquire) {
        if kill.load(Ordering::Acquire) {
            break;
        }
        match (thaw_evt, kill_evt) {
            (Some(thaw), kev) => {
                let mut pfds = [
                    libc::pollfd {
                        fd: thaw.as_raw_fd(),
                        events: libc::POLLIN,
                        revents: 0,
                    },
                    libc::pollfd {
                        fd: kev.map_or(-1, |k| k.as_raw_fd()),
                        events: libc::POLLIN,
                        revents: 0,
                    },
                ];
                let nfds = if kev.is_some() { 2 } else { 1 };
                unsafe {
                    libc::poll(pfds.as_mut_ptr(), nfds as libc::nfds_t, 100);
                }
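                // The poll() return value is deliberately unchecked:
                // readiness, timeout, and EINTR all fall through to
                // the freeze/kill re-check at the top of the loop,
                // which is the real exit condition.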
                // Do NOT drain the shared `thaw_evt` here. The
                // coordinator writes to thaw_evt ONCE per thaw and
                // every parked AP polls the SAME fd; if the first
                // wake-winner drains the counter, every other AP's
                // poll blocks for the full 100 ms backstop instead
                // of waking immediately. Leaving the eventfd level
                // high means poll returns immediately for every AP
                // — fast thaw across all peers. The `freeze.load
                // (Acquire)` re-check at the top of the loop is the
                // source of truth: once `freeze` clears the loop
                // exits regardless of eventfd state.
            }
            (None, _) => {
                // No thaw_evt plumbed (e.g. interactive shell path
                // that doesn't run a freeze coordinator). Fall back
                // to the legacy park_timeout cadence — the freeze
                // flag is never set on that path, so this branch is
                // unreachable in practice, but the safe-by-
                // construction fallback keeps the function callable
                // with all-None eventfds.
                std::thread::park_timeout(std::time::Duration::from_millis(10));
            }
        }
    }

    // Resume: clear parked so subsequent freeze cycles are observable.
    parked.store(false, Ordering::Release);
}

// ---------------------------------------------------------------------------
// I/O dispatch — shared between BSP and AP run loops
// ---------------------------------------------------------------------------

const KVM_SYSTEM_EVENT_SHUTDOWN: u32 = 1;
const KVM_SYSTEM_EVENT_RESET: u32 = 2;

/// Classified vCPU exit action from `classify_exit`.
pub(crate) enum ExitAction {
    /// Continue running (I/O handled, etc.).
    Continue,
    /// Clean shutdown (system reset, VcpuExit::Shutdown, etc.).
    Shutdown,
    /// Fatal error. `Some(reason)` for FailEntry, `None` for InternalError.
    Fatal(Option<u64>),
}

/// Classify a VcpuExit into an ExitAction, dispatching arch-specific I/O.
///
/// Returns `None` for HLT and for any unhandled exit variant (the
/// caller checks the kill flag and continues).
/// Takes the exit by mutable reference so IoIn/MmioRead data buffers
/// can be written back.
///
/// On aarch64, serial and virtio-console are dispatched via MMIO.
/// On x86_64, serial is dispatched via port I/O; virtio-console via MMIO.
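///
/// A minimal usage sketch (mirrors the run-loop call site; device
/// handles elided — `com1`/`com2`/`exit` assumed in scope):
///
/// ```ignore
/// match classify_exit(&com1, &com2, None, None, None, &mut exit) {
///     Some(ExitAction::Continue) | None => {}  // keep running
///     Some(ExitAction::Shutdown) => { /* set kill, wake coordinator */ }
///     Some(ExitAction::Fatal(_reason)) => { /* log + propagate kill */ }
/// }
/// ```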
#[allow(clippy::too_many_arguments)]
pub(crate) fn classify_exit(
    com1: &PiMutex<console::Serial>,
    com2: &PiMutex<console::Serial>,
    virtio_con: Option<&PiMutex<virtio_console::VirtioConsole>>,
    virtio_blk: Option<&PiMutex<virtio_blk::VirtioBlk>>,
    virtio_net: Option<&PiMutex<virtio_net::VirtioNet>>,
    exit: &mut VcpuExit,
) -> Option<ExitAction> {
    match exit {
        #[cfg(target_arch = "x86_64")]
        VcpuExit::IoOut(port, data) => {
            if dispatch_io_out(com1, com2, *port, data) {
                Some(ExitAction::Shutdown)
            } else {
                Some(ExitAction::Continue)
            }
        }
        #[cfg(target_arch = "x86_64")]
        VcpuExit::IoIn(port, data) => {
            dispatch_io_in(com1, com2, *port, data);
            Some(ExitAction::Continue)
        }
        #[cfg(target_arch = "aarch64")]
        VcpuExit::MmioWrite(addr, data) => {
            // aarch64 has no MMIO-side shutdown signal — guest reboot
            // arrives as VcpuExit::SystemEvent (PSCI), handled below.
            dispatch_mmio_write(com1, com2, virtio_con, virtio_blk, virtio_net, *addr, data);
            Some(ExitAction::Continue)
        }
        #[cfg(target_arch = "aarch64")]
        VcpuExit::MmioRead(addr, data) => {
            dispatch_mmio_read(com1, com2, virtio_con, virtio_blk, virtio_net, *addr, data);
            Some(ExitAction::Continue)
        }
        VcpuExit::Hlt => None,
        VcpuExit::Shutdown => Some(ExitAction::Shutdown),
        VcpuExit::SystemEvent(event_type, _) => {
            if *event_type == KVM_SYSTEM_EVENT_SHUTDOWN || *event_type == KVM_SYSTEM_EVENT_RESET {
                Some(ExitAction::Shutdown)
            } else {
                Some(ExitAction::Continue)
            }
        }
        VcpuExit::FailEntry(reason, _cpu) => Some(ExitAction::Fatal(Some(*reason))),
        VcpuExit::InternalError => Some(ExitAction::Fatal(None)),
        #[cfg(target_arch = "x86_64")]
        VcpuExit::MmioRead(addr, data) => {
            if let Some(vc) = virtio_con {
                let base = kvm::VIRTIO_CONSOLE_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_console::VIRTIO_MMIO_SIZE {
                    vc.lock().mmio_read(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            if let Some(vb) = virtio_blk {
                let base = kvm::VIRTIO_BLK_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_blk::VIRTIO_MMIO_SIZE {
                    vb.lock().mmio_read(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            if let Some(vn) = virtio_net {
                let base = kvm::VIRTIO_NET_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_net::VIRTIO_MMIO_SIZE {
                    vn.lock().mmio_read(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            for b in data.iter_mut() {
                *b = 0xff;
            }
            Some(ExitAction::Continue)
        }
        #[cfg(target_arch = "x86_64")]
        VcpuExit::MmioWrite(addr, data) => {
            if let Some(vc) = virtio_con {
                let base = kvm::VIRTIO_CONSOLE_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_console::VIRTIO_MMIO_SIZE {
                    vc.lock().mmio_write(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            if let Some(vb) = virtio_blk {
                let base = kvm::VIRTIO_BLK_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_blk::VIRTIO_MMIO_SIZE {
                    vb.lock().mmio_write(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            if let Some(vn) = virtio_net {
                let base = kvm::VIRTIO_NET_MMIO_BASE;
                if *addr >= base && *addr < base + virtio_net::VIRTIO_MMIO_SIZE {
                    vn.lock().mmio_write(*addr - base, data);
                    return Some(ExitAction::Continue);
                }
            }
            Some(ExitAction::Continue)
        }
        _ => None,
    }
}

/// I8042 ports and commands — minimal emulation for x86 guest reboot.
/// The kernel's default reboot method (`reboot=k`) writes CMD_RESET_CPU
/// (0xFE) to the i8042 command port (0x64).
#[cfg(target_arch = "x86_64")]
const I8042_DATA_PORT: u16 = 0x60;
#[cfg(target_arch = "x86_64")]
const I8042_CMD_PORT: u16 = 0x64;
#[cfg(target_arch = "x86_64")]
const I8042_CMD_RESET_CPU: u8 = 0xFE;

/// Dispatch an I/O out to serial ports or system devices.
/// Returns `true` if the caller should exit (system reset detected).
#[cfg(target_arch = "x86_64")]
fn dispatch_io_out(
    com1: &PiMutex<console::Serial>,
    com2: &PiMutex<console::Serial>,
    port: u16,
    data: &[u8],
) -> bool {
    // I8042 reset: kernel writes 0xFE to port 0x64 during reboot.
    if port == I8042_CMD_PORT && data.first() == Some(&I8042_CMD_RESET_CPU) {
        return true;
    }
    // Only lock the matching serial port based on port range.
    if (console::COM1_BASE..console::COM1_BASE + 8).contains(&port) {
        com1.lock().handle_out(port, data);
    } else if (console::COM2_BASE..console::COM2_BASE + 8).contains(&port) {
        com2.lock().handle_out(port, data);
    }
    false
}

/// Dispatch an I/O in from serial ports or system devices.
/// Handles i8042 reads to satisfy the kernel's keyboard probe.
#[cfg(target_arch = "x86_64")]
fn dispatch_io_in(
    com1: &PiMutex<console::Serial>,
    com2: &PiMutex<console::Serial>,
    port: u16,
    data: &mut [u8],
) {
    match port {
        // I8042 status: return 0 (no data, buffer empty).
        I8042_CMD_PORT => {
            if let Some(b) = data.first_mut() {
                *b = 0;
            }
        }
        // I8042 data: return 0 (no keypress).
        I8042_DATA_PORT => {
            if let Some(b) = data.first_mut() {
                *b = 0;
            }
        }
        // Only lock the matching serial port based on port range.
        p if (console::COM1_BASE..console::COM1_BASE + 8).contains(&p) => {
            com1.lock().handle_in(port, data);
        }
        p if (console::COM2_BASE..console::COM2_BASE + 8).contains(&p) => {
            com2.lock().handle_in(port, data);
        }
        _ => {}
    }
}

#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::*;

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_out_i8042_reset_is_shutdown_signal() {
        // The BSP relies on I8042 reset (port 0x64, 0xFE) for shutdown
        // detection instead of VcpuExit::Hlt. Verify that dispatch_io_out
        // returns true for the reset command.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        assert!(
            dispatch_io_out(&com1, &com2, I8042_CMD_PORT, &[I8042_CMD_RESET_CPU]),
            "I8042 reset (0xFE to port 0x64) must signal shutdown"
        );
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_out_i8042_non_reset() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        assert!(!dispatch_io_out(&com1, &com2, I8042_CMD_PORT, &[0x00]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_out_serial_com1() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        // Write 'A' to COM1 THR — should not trigger reset.
        assert!(!dispatch_io_out(&com1, &com2, console::COM1_BASE, b"A"));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_out_serial_com2() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        assert!(!dispatch_io_out(&com1, &com2, console::COM2_BASE, b"B"));
        let output = com2.lock().output();
        assert!(output.contains('B'));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_out_unknown_port() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        assert!(!dispatch_io_out(&com1, &com2, 0x1234, &[0xFF]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_in_i8042_status() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut data = [0xFFu8; 1];
        dispatch_io_in(&com1, &com2, I8042_CMD_PORT, &mut data);
        assert_eq!(data[0], 0);
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_in_i8042_data() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut data = [0xFFu8; 1];
        dispatch_io_in(&com1, &com2, I8042_DATA_PORT, &mut data);
        assert_eq!(data[0], 0);
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dispatch_io_in_unknown_port() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut data = [0xFFu8; 1];
        dispatch_io_in(&com1, &com2, 0x1234, &mut data);
        assert_eq!(data[0], 0xFF, "unknown port should not modify data");
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn classify_exit_io_out_i8042_reset_is_shutdown() {
        // Cross-checks the dispatch_io_out i8042-reset path through
        // the classify_exit dispatch table — verifies the "true"
        // return from dispatch_io_out maps to ExitAction::Shutdown.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data = [I8042_CMD_RESET_CPU];
        let mut exit = VcpuExit::IoOut(I8042_CMD_PORT, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Shutdown)),
            "IoOut(0x64, [0xFE]) — i8042 reset — must classify as Shutdown"
        );
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn classify_exit_io_out_serial_is_continue() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data = [b'Z'];
        let mut exit = VcpuExit::IoOut(console::COM1_BASE, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "IoOut to COM1 must classify as Continue (no reboot)"
        );
        // Confirm the byte landed in COM1's output buffer — pins that
        // the dispatch wired to the right port mutex, not COM2.
        assert!(com1.lock().output().contains('Z'));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn classify_exit_io_in_serial_is_continue() {
        // IoIn on COM1's data port returns a buffered byte if pending,
        // 0 otherwise. classify_exit must map IoIn → Continue
        // unconditionally — the run loop never terminates on a port
        // read.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut data = [0xFFu8; 1];
        let mut exit = VcpuExit::IoIn(console::COM1_BASE, &mut data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "IoIn to COM1 must classify as Continue"
        );
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn classify_exit_x86_mmio_read_unmapped_returns_0xff() {
        // x86_64 MMIO read fallback: when a guest MMIO read addresses
        // a region that is NOT in any of the virtio MMIO windows, the
        // dispatch fills the data buffer with 0xFF (the canonical
        // "no device responded" pattern — matches PCI/MMIO DECODE
        // ERROR behaviour on real hardware). Pinning here so a future
        // refactor cannot accidentally leave the buffer untouched
        // (which would surface as the previous KVM_RUN's stale data
        // appearing in the guest as a phantom value).
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        // Pick an address well below any virtio_*_MMIO_BASE so no
        // device window matches it. 0x1000 sits in low guest memory,
        // which ktstr does not back with any MMIO device.
        let mut buf = [0u8; 4];
        let mut exit = VcpuExit::MmioRead(0x1000, &mut buf);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "Unmapped MMIO read must classify as Continue (not Fatal)"
        );
        assert_eq!(
            buf,
            [0xff, 0xff, 0xff, 0xff],
            "Unmapped MMIO read must fill the data buffer with 0xFF — \
             leaving stale bytes would surface as phantom guest reads"
        );
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn classify_exit_x86_mmio_write_unmapped_is_continue() {
        // x86_64 MMIO write fallback: an unmapped write is silently
        // dropped (no device matches → control falls through). Pin
        // that the action is Continue so a future change that
        // accidentally classifies "no virtio match" as Fatal would
        // break this test.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data = [0xAAu8, 0xBB];
        let mut exit = VcpuExit::MmioWrite(0x1000, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "Unmapped MMIO write must classify as Continue"
        );
    }
}

/// Arch-neutral classify_exit coverage. `classify_exit` is shared
/// across x86_64 and aarch64; these tests construct synthetic
/// `VcpuExit` values that exist on both arches and assert the
/// dispatch table's behavior. Kept in a separate `cfg(test)` module
/// (no arch gate) so the same coverage runs on both targets — the
/// surrounding x86_64-gated `mod tests` carries port-I/O and
/// I8042-reset tests that ARE arch-specific.
#[cfg(test)]
mod tests_arch_neutral {
    use super::*;

    #[test]
    fn classify_exit_hlt_returns_none() {
        // VcpuExit::Hlt is the AP-thread idle marker — classify_exit
        // returns None to signal "caller handles the kill-flag check
        // and continues". Pinning here so a future change that
        // accidentally maps Hlt to ExitAction::Continue (which would
        // skip the per-iteration kill recheck) is caught.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut exit = VcpuExit::Hlt;
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(action.is_none(), "Hlt must classify as None");
    }

    #[test]
    fn classify_exit_shutdown_variant_is_shutdown() {
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut exit = VcpuExit::Shutdown;
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Shutdown)),
            "Shutdown variant must classify as ExitAction::Shutdown"
        );
    }

    #[test]
    fn classify_exit_system_event_shutdown_is_shutdown() {
        // KVM_SYSTEM_EVENT_SHUTDOWN (1) is the PSCI-style clean
        // shutdown signal both arches surface for guest-initiated
        // power-off. Must classify as Shutdown so the run loop stops
        // the BSP and APs.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data: [u64; 0] = [];
        let mut exit = VcpuExit::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Shutdown)),
            "SystemEvent(SHUTDOWN=1) must classify as Shutdown"
        );
    }

    #[test]
    fn classify_exit_system_event_reset_is_shutdown() {
        // KVM_SYSTEM_EVENT_RESET (2) is the guest reboot signal. ktstr
        // treats reboot-as-shutdown — there is no warm-reboot path —
        // so reset and shutdown collapse to the same ExitAction.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data: [u64; 0] = [];
        let mut exit = VcpuExit::SystemEvent(KVM_SYSTEM_EVENT_RESET, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Shutdown)),
            "SystemEvent(RESET=2) must classify as Shutdown"
        );
    }

    #[test]
    fn classify_exit_system_event_unknown_type_is_continue() {
        // Unknown SystemEvent types (CRASH=3, WAKEUP=4, future codes)
        // must NOT terminate the run loop — KVM may evolve the event
        // namespace, and ktstr must not falsely shut down on a value
        // it does not recognize. Pinning a non-{1,2} type returns
        // Continue so the BSP polls forward.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data: [u64; 0] = [];
        // 99 is well outside KVM_SYSTEM_EVENT_{SHUTDOWN, RESET, CRASH,
        // WAKEUP, SUSPEND, SEV_TERM} — picked to defend against future
        // expansion of the legitimate set without flipping this test.
        let mut exit = VcpuExit::SystemEvent(99, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "SystemEvent with unknown type must classify as Continue, \
             not Shutdown — the run loop must not terminate on \
             unknown KVM event codes"
        );
    }

    #[test]
    fn classify_exit_fail_entry_is_fatal_with_reason() {
        // KVM_EXIT_FAIL_ENTRY surfaces a hardware-side entry failure
        // (typically VMX/SVM consistency check failed). The reason
        // must propagate through to ExitAction::Fatal so the failure
        // dump records the architectural code an operator can decode.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut exit = VcpuExit::FailEntry(0xdead_beef, 7);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        match action {
            Some(ExitAction::Fatal(Some(reason))) => assert_eq!(
                reason, 0xdead_beef,
                "FailEntry reason must round-trip into Fatal(Some(_))"
            ),
            other => panic!(
                "FailEntry must classify as Fatal(Some(reason)); got tag {}",
                action_tag(&other)
            ),
        }
    }

    #[test]
    fn classify_exit_internal_error_is_fatal_none() {
        // KVM_EXIT_INTERNAL_ERROR has no architectural reason code —
        // classify as Fatal(None) so the failure-dump emitter can
        // distinguish it from FailEntry and emit the correct prose.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut exit = VcpuExit::InternalError;
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Fatal(None))),
            "InternalError must classify as Fatal(None)"
        );
    }

    /// Helper: erase the inner data of an ExitAction so panic messages
    /// don't try to format the reason payload (Fatal carries Option<u64>).
    fn action_tag(a: &Option<ExitAction>) -> u8 {
        match a {
            None => 0,
            Some(ExitAction::Continue) => 1,
            Some(ExitAction::Shutdown) => 2,
            Some(ExitAction::Fatal(_)) => 3,
        }
    }
}

#[cfg(all(test, target_arch = "aarch64"))]
mod tests_aarch64 {
    use super::*;

    // -- aarch64 dispatch_mmio_read / dispatch_mmio_write coverage --
    //
    // dispatch_mmio_write is unit-typed (always returns) — the aarch64
    // shutdown signal arrives via VcpuExit::SystemEvent (PSCI), not an
    // MMIO write. dispatch_mmio_read fills with 0xFF when no device
    // window matches, so the guest sees a "no device responded" pattern
    // rather than stale bytes.

    #[test]
    fn dispatch_mmio_read_unmapped_returns_0xff() {
        // Pin the canonical "no device responded" fill at 0xFF for
        // unmapped MMIO reads. A future regression that left the
        // buffer untouched would surface as the previous KVM_RUN's
        // stale data appearing as a phantom guest read.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut buf = [0u8; 4];
        // 0x10_0000 is below SERIAL_MMIO_BASE (0x0900_0000) and inside
        // no virtio device window — picked so no early-return matches.
        dispatch_mmio_read(
            &com1, &com2, None, None, None, /* addr */ 0x10_0000, &mut buf,
        );
        assert_eq!(
            buf,
            [0xff, 0xff, 0xff, 0xff],
            "Unmapped MMIO read must fill the data buffer with 0xFF"
        );
    }

    #[test]
    fn dispatch_mmio_read_serial_does_not_fill_0xff() {
        // Reads inside the SERIAL_MMIO_BASE window go through the COM1
        // mutex — the 0xFF fallback fill must NOT run. Verifies the
        // serial branch is taken (data[0] is whatever the LSR returns;
        // we only assert "not 0xFF" because the precise LSR value is
        // a console-internal detail and not what this test pins).
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut buf = [0xAAu8; 1];
        // SERIAL_MMIO_BASE + 5 = LSR offset on the 8250 register map.
        // The serial branch overwrites buf[0] with the LSR value;
        // that value is non-0xFF for an idle Serial.
        dispatch_mmio_read(
            &com1,
            &com2,
            None,
            None,
            None,
            kvm::SERIAL_MMIO_BASE + 5,
            &mut buf,
        );
        assert_ne!(
            buf[0], 0xFF,
            "Serial MMIO read must invoke the COM1 LSR path, not the \
             unmapped 0xFF fallback"
        );
    }

    #[test]
    fn classify_exit_aarch64_mmio_write_serial_is_continue() {
        // aarch64 MmioWrite to the COM1 serial window must classify as
        // Continue (no MMIO-side shutdown signal — reboot is PSCI via
        // SystemEvent). The byte must reach the COM1 output buffer to
        // pin the dispatch wire-up.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let data = [b'Q'];
        let mut exit = VcpuExit::MmioWrite(kvm::SERIAL_MMIO_BASE, &data);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "aarch64 MmioWrite to serial must classify as Continue"
        );
        assert!(
            com1.lock().output().contains('Q'),
            "Serial MMIO write must land the byte in COM1 output"
        );
    }

    #[test]
    fn classify_exit_aarch64_mmio_read_unmapped_returns_0xff() {
        // Cross-check: classify_exit + dispatch_mmio_read 0xFF fallback
        // through the public dispatch path. Unmapped MMIO read must be
        // Continue with the buffer overwritten to 0xFF.
        let com1 = PiMutex::new(console::Serial::new(console::COM1_BASE));
        let com2 = PiMutex::new(console::Serial::new(console::COM2_BASE));
        let mut buf = [0u8; 4];
        let mut exit = VcpuExit::MmioRead(0x10_0000, &mut buf);
        let action = classify_exit(&com1, &com2, None, None, None, &mut exit);
        assert!(
            matches!(action, Some(ExitAction::Continue)),
            "Unmapped aarch64 MMIO read must classify as Continue"
        );
        // Re-borrow `buf` to read it post-call — the &mut borrow held
        // inside `exit` ends at `exit`'s last use (NLL), so the
        // backing array can be re-inspected here.
        assert_eq!(
            buf,
            [0xff, 0xff, 0xff, 0xff],
            "Unmapped aarch64 MMIO read must fill with 0xFF"
        );
    }
}

#[cfg(all(test, target_arch = "x86_64"))]
mod handle_freeze_tests {
    //! Tests for `handle_freeze` that do require a real KVM VcpuFd —
    //! they boot a minimal `KtstrKvm` (1 vCPU, 64 MB) so the freeze
    //! drain dance and reg-capture paths exercise real KVM ioctls.
    //!
    //! These tests skip the parking loop by leaving `freeze=false`,
    //! so the function falls straight through after the drain dance,
    //! reg capture, and parked Release-store. That covers:
    //!   - has_immediate_exit=true → vcpu.run() returns EINTR (the
    //!     KVM_CAP_IMMEDIATE_EXIT contract) and the drain swallows it
    //!   - capture_vcpu_regs writes a Some(_) to regs_slot
    //!   - parked AtomicBool gets a Release-ordered true→false
    //!     toggle (set after drain + reg capture, cleared on return)
    //!   - parked_evt.write(1) writes one edge

    use super::*;
    use std::sync::atomic::{AtomicBool, Ordering};

    #[test]
    fn handle_freeze_drain_swallows_eintr_and_resets_state() {
        // Boot a minimal VM, set up sregs so capture_vcpu_regs has a
        // valid CR3, then call handle_freeze. With freeze=false, the
        // park loop falls through; the drain dance runs because we
        // pass has_immediate_exit=true. The KVM_CAP_IMMEDIATE_EXIT
        // contract returns EINTR from vcpu.run() — handle_freeze logs
        // and continues, NOT panics.
        use crate::vmm::kvm::KtstrKvm;
        use crate::vmm::topology::Topology;
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let mut vm = KtstrKvm::new(topo, 64, false).unwrap();
        crate::vmm::x86_64::boot::setup_sregs(&vm.guest_mem, &vm.vcpus[0], false).unwrap();

        let kill = std::sync::Arc::new(AtomicBool::new(false));
        let freeze = std::sync::Arc::new(AtomicBool::new(false));
        let parked = std::sync::Arc::new(AtomicBool::new(false));
        let regs_slot = std::sync::Arc::new(std::sync::Mutex::new(None));

        // freeze=false → the park loop exits immediately; the function
        // returns after the drain + reg capture + parked toggle.
        handle_freeze(
            &mut vm.vcpus[0],
            true, // has_immediate_exit — exercise the EINTR drain path
            &kill,
            &freeze,
            &parked,
            &regs_slot,
            None,
            None,
            None,
        );

        // After handle_freeze returns:
        //   - parked must be false (Released on exit so subsequent
        //     freeze cycles are observable).
        //   - regs_slot must hold Some(_) — capture_vcpu_regs ran
        //     successfully on the freshly-init'd vCPU.
        assert!(
            !parked.load(Ordering::Acquire),
            "parked must be cleared on exit so subsequent freeze \
             cycles can observe a fresh true→false edge"
        );
        let snapshot = regs_slot.lock().unwrap();
        assert!(
            snapshot.is_some(),
            "capture_vcpu_regs must populate regs_slot for a freshly-\
             init'd vCPU — None means KVM_GET_REGS failed unexpectedly"
        );
    }

    #[test]
    fn handle_freeze_no_drain_when_immediate_exit_unsupported() {
        // With has_immediate_exit=false, the drain dance is skipped
        // entirely (the doc explains: calling vcpu.run() with the cap
        // absent would re-enter the guest instead of returning EINTR).
        // Verify the function still completes, captures regs, and
        // toggles parked correctly.
        use crate::vmm::kvm::KtstrKvm;
        use crate::vmm::topology::Topology;
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let mut vm = KtstrKvm::new(topo, 64, false).unwrap();
        crate::vmm::x86_64::boot::setup_sregs(&vm.guest_mem, &vm.vcpus[0], false).unwrap();

        let kill = std::sync::Arc::new(AtomicBool::new(false));
        let freeze = std::sync::Arc::new(AtomicBool::new(false));
        let parked = std::sync::Arc::new(AtomicBool::new(false));
        let regs_slot = std::sync::Arc::new(std::sync::Mutex::new(None));

        handle_freeze(
            &mut vm.vcpus[0],
            false, // has_immediate_exit=false → drain dance skipped
            &kill,
            &freeze,
            &parked,
            &regs_slot,
            None,
            None,
            None,
        );

        assert!(!parked.load(Ordering::Acquire));
        assert!(regs_slot.lock().unwrap().is_some());
    }

    #[test]
    fn handle_freeze_writes_parked_evt_edge() {
        // The parked_evt write is a wake edge for the freeze
        // coordinator's epoll loop. Verify exactly one byte/edge gets
        // written by reading back the eventfd counter post-call —
        // an EFD_NONBLOCK eventfd returns the accumulated count on
        // read and resets to 0.
        use crate::vmm::kvm::KtstrKvm;
        use crate::vmm::topology::Topology;
        let topo = Topology {
            llcs: 1,
            cores_per_llc: 1,
            threads_per_core: 1,
            numa_nodes: 1,
            nodes: None,
            distances: None,
        };
        let mut vm = KtstrKvm::new(topo, 64, false).unwrap();
        crate::vmm::x86_64::boot::setup_sregs(&vm.guest_mem, &vm.vcpus[0], false).unwrap();

        let kill = std::sync::Arc::new(AtomicBool::new(false));
        let freeze = std::sync::Arc::new(AtomicBool::new(false));
        let parked = std::sync::Arc::new(AtomicBool::new(false));
        let regs_slot = std::sync::Arc::new(std::sync::Mutex::new(None));
        // EFD_NONBLOCK matches production usage in freeze_coord.
        let parked_evt = EventFd::new(vmm_sys_util::eventfd::EFD_NONBLOCK).unwrap();

        handle_freeze(
            &mut vm.vcpus[0],
            false,
            &kill,
            &freeze,
            &parked,
            &regs_slot,
            Some(&parked_evt),
            None,
            None,
        );

        // EventFd::read returns the accumulated counter and resets it.
        // We expect exactly 1 (one edge from the write(1) inside
        // handle_freeze).
        let counter = parked_evt.read().unwrap();
        assert_eq!(
            counter, 1,
            "handle_freeze must write exactly one wake edge to \
             parked_evt — coordinator depends on this to advance \
             from epoll_wait without spurious extra wakes"
        );
    }
}

#[cfg(test)]
mod vcpu_reg_snapshot_tests {
    use super::*;

    #[test]
    fn vcpu_reg_snapshot_display_renders_three_hex_fields() {
        // x86_64-shape snapshot (user_page_table_root=None): only
        // the three core hex fields render; no `uptroot=` suffix.
        let s = VcpuRegSnapshot {
            instruction_pointer: 0xffff_ffff_8100_1234,
            stack_pointer: 0xffff_ffff_8000_0000,
            page_table_root: 0x0123_4567_89ab_cdef,
            user_page_table_root: None,
            tcr_el1: None,
        };
        let out = format!("{s}");
        assert_eq!(
            out,
            "ip=0xffffffff81001234 sp=0xffffffff80000000 ptroot=0x0123456789abcdef"
        );
    }

    #[test]
    fn vcpu_reg_snapshot_display_appends_user_pt_root_when_present() {
        // aarch64-shape snapshot: user_page_table_root populated
        // → Display appends ` uptroot=0x...`. Pinning here so a
        // future Display tweak (e.g. swapping " " for "\n  ")
        // is caught.
        let s = VcpuRegSnapshot {
            instruction_pointer: 0xffff_8000_8100_1234,
            stack_pointer: 0xffff_8000_8000_0000,
            page_table_root: 0x0000_4000_8000_0000,
            user_page_table_root: Some(0x0000_0000_aaaa_bbbb),
            tcr_el1: Some(0xb510_0010),
        };
        let out = format!("{s}");
        assert_eq!(
            out,
            "ip=0xffff800081001234 sp=0xffff800080000000 ptroot=0x0000400080000000 uptroot=0x00000000aaaabbbb"
        );
    }

    #[test]
    fn vcpu_reg_snapshot_serde_round_trip() {
        let s = VcpuRegSnapshot {
            instruction_pointer: 0x1,
            stack_pointer: 0x2,
            page_table_root: 0x3,
            user_page_table_root: None,
            tcr_el1: None,
        };
        let json = serde_json::to_string(&s).expect("serialize");
        // Pin the JSON key names so a future field rename is
        // caught here rather than in downstream consumers
        // (operator JSON parsers, the failure_dump_e2e fixture).
        // Arch-neutral keys: see field doc on
        // VcpuRegSnapshot::page_table_root for the per-arch
        // semantics each one carries.
        assert!(
            json.contains("\"instruction_pointer\""),
            "missing JSON key `instruction_pointer`: {json}"
        );
        assert!(
            json.contains("\"stack_pointer\""),
            "missing JSON key `stack_pointer`: {json}"
        );
        assert!(
            json.contains("\"page_table_root\""),
            "missing JSON key `page_table_root`: {json}"
        );
        // user_page_table_root is None → serde-skipped via
        // skip_serializing_if = "Option::is_none"; assert it does
        // NOT appear in the JSON.
        assert!(
            !json.contains("\"user_page_table_root\""),
            "user_page_table_root must skip-serialize when None: {json}"
        );
        let parsed: VcpuRegSnapshot = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed.instruction_pointer, 0x1);
        assert_eq!(parsed.stack_pointer, 0x2);
        assert_eq!(parsed.page_table_root, 0x3);
        assert!(
            parsed.user_page_table_root.is_none(),
            "missing field must deserialize as None"
        );
    }

    #[test]
    fn vcpu_reg_snapshot_serde_round_trip_with_user_pt_root() {
        // aarch64-shape: user_page_table_root populated → JSON
        // carries the key, deserialize round-trips the value.
        let s = VcpuRegSnapshot {
            instruction_pointer: 0x1,
            stack_pointer: 0x2,
            page_table_root: 0x3,
            user_page_table_root: Some(0xdead_beef_cafe_d00d),
            tcr_el1: None,
        };
        let json = serde_json::to_string(&s).expect("serialize");
        assert!(
            json.contains("\"user_page_table_root\""),
            "user_page_table_root must serialize when Some: {json}"
        );
        let parsed: VcpuRegSnapshot = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed.user_page_table_root, Some(0xdead_beef_cafe_d00d));
    }

    #[test]
    fn vcpu_reg_snapshot_serde_round_trip_tcr_el1() {
        // Mirrors the user_page_table_root coverage but for tcr_el1:
        // skip_serializing_if = "Option::is_none" must drop the key
        // when None and emit + round-trip it when Some. Picks a
        // representative TCR_EL1 word (T1SZ=0x10 in [21:16] with
        // TG1=0b10 / 4 KB granule in [31:30]) so the test pins the
        // wire format the page-table walker actually sees on aarch64.
        let some_val: u64 = 0x0000_0000_b510_0010;
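
        // Worked decode of the representative word (illustrative):
        //   (some_val >> 30) & 0b11 == 0b10 → TG1 = 4 KB granule
        //   (some_val >> 16) & 0x3f == 0x10 → T1SZ = 16
        //    some_val        & 0x3f == 0x10 → T0SZ = 16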

        let s_some = VcpuRegSnapshot {
            instruction_pointer: 0x1,
            stack_pointer: 0x2,
            page_table_root: 0x3,
            user_page_table_root: None,
            tcr_el1: Some(some_val),
        };
        let json_some = serde_json::to_string(&s_some).expect("serialize Some");
        assert!(
            json_some.contains("\"tcr_el1\""),
            "tcr_el1 must serialize when Some: {json_some}"
        );

        let s_none = VcpuRegSnapshot {
            instruction_pointer: 0x1,
            stack_pointer: 0x2,
            page_table_root: 0x3,
            user_page_table_root: None,
            tcr_el1: None,
        };
        let json_none = serde_json::to_string(&s_none).expect("serialize None");
        assert!(
            !json_none.contains("\"tcr_el1\""),
            "tcr_el1 must skip-serialize when None: {json_none}"
        );

        // Deserialize Some-flavour JSON back, assert value preserved.
        let parsed_some: VcpuRegSnapshot =
            serde_json::from_str(&json_some).expect("deserialize Some");
        assert_eq!(parsed_some.tcr_el1, Some(some_val));

        // Deserialize JSON without the key (None-flavour); the
        // serde(default) attribute must yield None rather than
        // failing because the field is absent.
        let parsed_none: VcpuRegSnapshot =
            serde_json::from_str(&json_none).expect("deserialize None");
        assert!(
            parsed_none.tcr_el1.is_none(),
            "missing tcr_el1 must deserialize as None"
        );
    }

    #[test]
    fn vcpu_reg_snapshot_zero_renders_zeros() {
        let s = VcpuRegSnapshot {
            instruction_pointer: 0,
            stack_pointer: 0,
            page_table_root: 0,
            user_page_table_root: None,
            tcr_el1: None,
        };
        // 16 hex digits each — leading zeros preserved so widths
        // line up across rows in multi-vcpu output.
        assert_eq!(
            format!("{s}"),
            "ip=0x0000000000000000 sp=0x0000000000000000 ptroot=0x0000000000000000"
        );
    }

    /// Pure-arithmetic test on the aarch64 KVM register IDs the
    /// capture path uses. Verifying the encoding here means a
    /// transcription bug (e.g. wrong byte offset, dropped flag,
    /// wrong sysreg op-code packing) would be caught without
    /// booting an aarch64 VM. Mirrors the kernel's
    /// `KVM_REG_ARM_CORE_REG(name) = offsetof(struct kvm_regs,
    /// name) / sizeof(__u32)` macro for core regs and the
    /// `ARM64_SYS_REG(Op0, Op1, CRn, CRm, Op2)` packing for
    /// sysregs (arch/arm64/include/uapi/asm/kvm.h).
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn aarch64_register_ids_match_kernel_encoding() {
        const KVM_REG_ARM64: u64 = 0x6000_0000_0000_0000;
        const KVM_REG_SIZE_U64: u64 = 0x0030_0000_0000_0000;
        const KVM_REG_ARM_CORE: u64 = 0x0010_0000;
        const KVM_REG_ARM64_SYSREG: u64 = 0x0013_0000;

        // PC at byte offset 256 in struct kvm_regs (= same offset
        // in user_pt_regs because user_pt_regs.regs is at offset
        // 0). 256 / 4 = 64.
        const EXPECTED_PC_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | 64;
        // SP_EL1 at byte offset 272 in struct kvm_regs (right
        // after the 272-byte user_pt_regs). 272 / 4 = 68.
        const EXPECTED_SP_EL1_ID: u64 = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | 68;
        // TTBR0_EL1 sysreg packing: Op0=3, Op1=0, CRn=2, CRm=0, Op2=0
        // → (3<<14) | (0<<11) | (2<<7) | (0<<3) | 0 = 0xC100.
        const EXPECTED_TTBR0_EL1_ID: u64 =
            KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC100;
        // TTBR1_EL1 sysreg packing: Op0=3, Op1=0, CRn=2, CRm=0, Op2=1
        // → (3<<14) | (0<<11) | (2<<7) | (0<<3) | 1 = 0xC101.
        const EXPECTED_TTBR1_EL1_ID: u64 =
            KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC101;

        // Reconstruct what capture_vcpu_regs declares to catch
        // any drift between the const declarations there and
        // the kernel ABI. The production declarations are not
        // importable from this module, so re-derive the values
        // here using the exact same expression form.
        let pc_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (256 / 4);
        let sp_el1_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (272 / 4);
        let ttbr0_el1_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC100;
        let ttbr1_el1_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xC101;

        assert_eq!(pc_id, EXPECTED_PC_ID, "PC_ID encoding drift");
        assert_eq!(
            sp_el1_id, EXPECTED_SP_EL1_ID,
            "SP_EL1_ID encoding drift — note offset is 272 (sp_el1), \
             not 248 (sp_el0)"
        );
        assert_eq!(
            ttbr0_el1_id, EXPECTED_TTBR0_EL1_ID,
            "TTBR0_EL1_ID encoding drift — verify (Op0=3, Op1=0, \
             CRn=2, CRm=0, Op2=0) packs to 0xC100"
        );
        assert_eq!(
            ttbr1_el1_id, EXPECTED_TTBR1_EL1_ID,
            "TTBR1_EL1_ID encoding drift — verify (Op0=3, Op1=0, \
             CRn=2, CRm=0, Op2=1) packs to 0xC101"
        );
        // Adjacency check: TTBR0 and TTBR1 differ only in Op2,
        // so the encoding must differ by exactly 1. Catches a
        // typo where one constant gets the other's value.
        assert_eq!(
            ttbr1_el1_id - ttbr0_el1_id,
            1,
            "TTBR0/TTBR1 encodings should differ by exactly 1 (Op2 bit)"
        );
    }
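
    /// Generalised sketch of the `ARM64_SYS_REG(Op0, Op1, CRn,
    /// CRm, Op2)` packing the constants above apply by hand. The
    /// `pack` helper is local to this test (hypothetical, not a
    /// production identifier); the shift amounts follow the
    /// kernel UAPI packing cited in the test above.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn arm64_sysreg_packing_sketch() {
        fn pack(op0: u64, op1: u64, crn: u64, crm: u64, op2: u64) -> u64 {
            (op0 << 14) | (op1 << 11) | (crn << 7) | (crm << 3) | op2
        }
        // Reproduces the hand-derived TTBR0_EL1 / TTBR1_EL1 words.
        assert_eq!(pack(3, 0, 2, 0, 0), 0xC100);
        assert_eq!(pack(3, 0, 2, 0, 1), 0xC101);
        // Third point on the curve: SCTLR_EL1 (Op0=3, Op1=0,
        // CRn=1, CRm=0, Op2=0) packs to 0xC080.
        assert_eq!(pack(3, 0, 1, 0, 0), 0xC080);
    }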

    /// DR7 wire format for a 4-byte write watchpoint in slot 0.
    ///
    /// The KVM hardware-watchpoint freeze trigger arms slot 0 of
    /// the guest's debug registers via `KVM_SET_GUEST_DEBUG` to
    /// catch writes to `sch->exit_kind`. The DR7 word the VMM
    /// hands KVM must encode the exact slot/length/access pattern
    /// the watchpoint requires, otherwise either KVM rejects the
    /// configuration or the breakpoint catches the wrong access
    /// class — both surface as silent freeze-coordinator failure
    /// (the guest never traps, no failure dump).
    ///
    /// Field layout per Intel SDM Vol 3 Ch 17.2 ("Debug Registers")
    /// and pinned against the production
    /// [`super::super::vcpu::self_arm_watchpoint`] x86_64 path which
    /// composes DR7 as:
    ///
    ///   base = `0x400 | 0x200 | 0x100`  (DR7_FIXED_1, GE, LE)
    ///   per slot i: `(0b11) << (2*i)`           (L<i> | G<i>)
    ///             | `(0b01) << (16 + 4*i)`     (R/W<i> = write)
    ///             | `(0b11) << (18 + 4*i)`     (LEN<i> = 4-byte)
    ///
    /// Bit-by-bit:
    ///
    ///   bit  0   L0   (local enable, slot 0) — 1
    ///   bit  1   G0   (global enable, slot 0) — 1
    ///   bit  8   LE   (local exact-match, required for data BPs)
    ///   bit  9   GE   (global exact-match, required for data BPs)
    ///   bit 10   reserved, must be 1 (DR7_FIXED_1)
    ///   bits [17:16]  R/W0 (00=exec, 01=write, 11=rw) — 01
    ///   bits [19:18]  LEN0 (00=1B, 01=2B, 10=8B, 11=4B) — 11
    ///
    /// Expected value 0xD0703 is what the production wire format
    /// emits for (slot=0, type=write, len=4). Pinning the arithmetic
    /// here means a future refactor that flips a bit (e.g. swaps R/W
    /// to exec, drops GE/LE, picks the wrong length encoding, drops
    /// L0 or G0) is caught at unit-test time before the trigger
    /// silently stops firing.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dr7_slot0_write_4byte_encoding() {
        // Field constants — local to the test so a future
        // production-side rename does not silently divorce the
        // assertion from the wire format.
        const DR7_FIXED_1: u64 = 1 << 10;
        const DR_LOCAL_EXACT: u64 = 1 << 8; // LE — local exact-match
        const DR_GLOBAL_EXACT: u64 = 1 << 9; // GE — global exact-match
        const DR_LOCAL_ENABLE: u64 = 1 << 0; // L0 — slot 0 local
        const DR_GLOBAL_ENABLE: u64 = 1 << 1; // G0 — slot 0 global
        const DR_RW_WRITE: u64 = 0b01;
        const DR_LEN_4: u64 = 0b11;
        // Slot 0 occupies bits 16/17 for R/W and 18/19 for LEN.
        const SLOT0_RW_SHIFT: u32 = 16;
        const SLOT0_LEN_SHIFT: u32 = 18;

        let dr7 = DR7_FIXED_1
            | DR_GLOBAL_EXACT
            | DR_LOCAL_EXACT
            | DR_LOCAL_ENABLE
            | DR_GLOBAL_ENABLE
            | (DR_RW_WRITE << SLOT0_RW_SHIFT)
            | (DR_LEN_4 << SLOT0_LEN_SHIFT);
        // 0xD0703 mirrors what `self_arm_watchpoint` in
        // `super::super::vcpu` actually programs into KVM. A drift
        // here means the watchpoint's wire format diverged from the
        // production encoding and `KVM_SET_GUEST_DEBUG` will arm the
        // wrong breakpoint (or none).
        assert_eq!(
            dr7, 0xD0703,
            "DR7 encoding for (slot=0, write, 4B) must match the production wire format"
        );

        // Bit-by-bit cross-check: every contributing bit must be
        // present, and every other bit must be clear. Catches the
        // failure mode where two bugs cancel — e.g. wrong shift
        // for R/W combined with wrong shift for LEN that happen
        // to sum to the right total.
        assert_ne!(dr7 & (1 << 0), 0, "L0 (bit 0) must be set");
        assert_ne!(dr7 & (1 << 1), 0, "G0 (bit 1) must be set");
        assert_ne!(
            dr7 & (1 << 8),
            0,
            "LE (bit 8) must be set for data breakpoints"
        );
        assert_ne!(
            dr7 & (1 << 9),
            0,
            "GE (bit 9) must be set for data breakpoints"
        );
        assert_ne!(dr7 & (1 << 10), 0, "DR7_FIXED_1 (bit 10) must be set");
        // Slot 0 R/W field = 0b01 (write).
        assert_eq!(
            (dr7 >> SLOT0_RW_SHIFT) & 0b11,
            DR_RW_WRITE,
            "slot 0 R/W field must encode write (0b01)"
        );
        // Slot 0 LEN field = 0b11 (4 bytes).
        assert_eq!(
            (dr7 >> SLOT0_LEN_SHIFT) & 0b11,
            DR_LEN_4,
            "slot 0 LEN field must encode 4 bytes (0b11)"
        );
        // No other slot should be enabled.
        assert_eq!(
            dr7 & 0b1111_1100,
            0,
            "slots 1..3 must be disabled (L/G bits clear)"
        );
        // R/W and LEN fields for slots 1..3 must be zero.
        assert_eq!(
            (dr7 >> 20) & 0xFFF,
            0,
            "slots 1..3 R/W + LEN fields must be zero"
        );
    }
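
    /// Per-slot generalisation of the DR7 arithmetic above, as a
    /// sketch: the same (write, 4-byte) pattern in slot i shifts
    /// the L/G pair by 2*i and the R/W + LEN fields by 4*i. The
    /// `dr7_write_4b` helper is hypothetical (local to this
    /// test), not the production encoder.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn dr7_write_4byte_slot_generalisation_sketch() {
        fn dr7_write_4b(slot: u32) -> u64 {
            let base: u64 = 0x400 | 0x200 | 0x100; // DR7_FIXED_1 | GE | LE
            base | (0b11u64 << (2 * slot))      // L<slot> | G<slot>
                | (0b01u64 << (16 + 4 * slot))  // R/W<slot> = write
                | (0b11u64 << (18 + 4 * slot))  // LEN<slot> = 4 bytes
        }
        // Slot 0 reproduces the pinned production word.
        assert_eq!(dr7_write_4b(0), 0xD0703);
        // Slot 1: L1/G1 at bits 3:2, R/W1 at 21:20, LEN1 at 23:22.
        assert_eq!(dr7_write_4b(1), 0x00D0_070C);
    }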

    /// DBGWCR wire format for a 4-byte write watchpoint at byte
    /// offset 0 of an 8-byte aligned block. The aarch64 sibling of
    /// the x86_64 [`dr7_slot0_write_4byte_encoding`] test above —
    /// pins the exact bit layout that
    /// [`super::super::vcpu::self_arm_watchpoint`] emits, so a future
    /// refactor that flips a bit (e.g. swaps LSC to read, drops PAC,
    /// picks the wrong BAS shift) is caught at unit-test time
    /// before the trigger silently stops firing.
    ///
    /// Field layout per ARM ARM D7.3.11 ("DBGWCR<n>_EL1, Debug
    /// Watchpoint Control Registers") and pinned against QEMU's
    /// `insert_hw_watchpoint` in `target/arm/hyp_gdbstub.c`:
    ///
    ///   bit 0       E   = 1 (enable)
    ///   bits [2:1]  PAC = 0b11 (EL0+EL1, any security state)
    ///   bits [4:3]  LSC = 0b10 (write-only)
    ///   bits [12:5] BAS = 0xF << byte_offset (4 contiguous bytes)
    ///   bit 13       HMC = 0
    ///   bits [15:14] SSC = 0
    ///   bits [19:16] LBN = 0
    ///   bit 20       WT = 0
    ///   bits [28:24] MASK = 0
    ///
    /// Concrete byte_offset=0 wire format: `0x1F7`.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn dbgwcr_slot0_write_4byte_encoding_offset0() {
        let kva: u64 = 0xffff_ffff_8100_1000; // 8-byte aligned (low bits = 0)
        let byte_offset = (kva & 0x7u64) as u32;
        let bas: u64 = 0xFu64 << byte_offset;
        let wcr: u64 = 1u64 | (0b11u64 << 1) | (0b10u64 << 3) | (bas << 5);
        assert_eq!(
            wcr, 0x1F7,
            "DBGWCR encoding for (slot=0, write, 4B, offset=0) must \
             match the QEMU/ARM ARM gold-standard wire format"
        );
        // Bit-by-bit cross-check.
        assert_eq!(wcr & 1, 1, "E (bit 0) must be set");
        assert_eq!(
            (wcr >> 1) & 0b11,
            0b11,
            "PAC (bits 2:1) must be 0b11 (EL0+EL1)"
        );
        assert_eq!(
            (wcr >> 3) & 0b11,
            0b10,
            "LSC (bits 4:3) must be 0b10 (write-only)"
        );
        assert_eq!(
            (wcr >> 5) & 0xFF,
            0x0F,
            "BAS (bits 12:5) must be 0x0F for offset=0 (4 \
             contiguous low bytes)"
        );
        // No other fields may be set.
        assert_eq!(
            (wcr >> 13) & 0x7F,
            0,
            "HMC + SSC + LBN (bits 19:13) must be zero"
        );
        assert_eq!(
            (wcr >> 20) & 0xF,
            0,
            "WT (bit 20) + reserved bits [23:21] must be zero"
        );
        assert_eq!((wcr >> 24) & 0x1F, 0, "MASK (bits 28:24) must be zero");
    }

    /// DBGWCR encoding for a 4-byte write watchpoint at byte offset
    /// 4 of an 8-byte aligned block. Catches the failure mode where
    /// a 4-byte aligned but not 8-byte aligned KVA (e.g. a struct
    /// field at offset 4 inside an 8-byte aligned struct) gets the
    /// wrong BAS shift. Concrete wire format: `0x1E17`.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn dbgwcr_slot0_write_4byte_encoding_offset4() {
        let kva: u64 = 0xffff_ffff_8100_1004; // 4-byte aligned, byte_offset=4
        let byte_offset = (kva & 0x7u64) as u32;
        let bas: u64 = 0xFu64 << byte_offset;
        let wcr: u64 = 1u64 | (0b11u64 << 1) | (0b10u64 << 3) | (bas << 5);
        assert_eq!(
            wcr, 0x1E17,
            "DBGWCR encoding for (slot=0, write, 4B, offset=4) must \
             match `0x1 | (3<<1) | (2<<3) | (0xF0 << 5)` = 0x1E17"
        );
        assert_eq!(
            (wcr >> 5) & 0xFF,
            0xF0,
            "BAS (bits 12:5) must be 0xF0 for offset=4 (4 \
             contiguous high bytes)"
        );
    }

    /// DBGWVR alignment: the WCR/WVR pair always uses an 8-byte
    /// aligned base and BAS to select the 4 bytes, so a 4-byte
    /// aligned KVA at offset 4 must yield WVR = `kva & ~0x7` (= the
    /// containing 8-byte block's base), not `kva` itself.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn dbgwvr_8byte_aligned_base() {
        let kva: u64 = 0xffff_ffff_8100_1004;
        let wvr = kva & !0x7u64;
        assert_eq!(
            wvr, 0xffff_ffff_8100_1000,
            "DBGWVR base must clear the bottom 3 bits (8-byte align) \
             so BAS picks the 4 watched bytes within the block"
        );
        // Round-trip: reconstructing the watched range from WVR + BAS.
        let byte_offset = (kva & 0x7u64) as u32;
        let bas: u64 = 0xFu64 << byte_offset;
        let watched_lo = wvr + (bas.trailing_zeros() as u64);
        let watched_hi = watched_lo + (bas.count_ones() as u64);
        assert_eq!(
            watched_lo, kva,
            "watched range low must equal the original KVA"
        );
        assert_eq!(
            watched_hi,
            kva + 4,
            "watched range high must equal kva + 4 (4 bytes)"
        );
    }
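
    /// Companion sketch for the aligned case: an already 8-byte
    /// aligned KVA passes through the `kva & !0x7` mask unchanged
    /// and its BAS stays in the low nibble. Pure arithmetic, no
    /// production types involved.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn dbgwvr_identity_for_aligned_kva() {
        let kva: u64 = 0xffff_ffff_8100_1000;
        assert_eq!(kva & !0x7u64, kva, "aligned KVA is its own WVR base");
        let bas: u64 = 0xFu64 << (kva & 0x7);
        assert_eq!(bas, 0xF, "BAS selects the 4 low bytes for offset 0");
    }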

    /// FAR-based slot decode for the aarch64 watchpoint exit path.
    /// Constructs a synthetic `kvm_debug_exit_arch` with EC =
    /// WATCHPT_LOW and a FAR inside slot 2's 4-byte window, runs
    /// the dispatch helper, and asserts only `user[1].hit` was
    /// latched (slot indices in `armed_slots` are 0=DR0/exit_kind,
    /// 1=DR1/user[0], 2=DR2/user[1], 3=DR3/user[2]). Also pins the
    /// post-fire single-step bookkeeping: the helper must mark
    /// `single_step_pending=true` and record the matched slot in
    /// the `single_step_slot` bitmap, so the next
    /// `self_arm_watchpoint` call disables that slot's WCR.E and
    /// asserts `KVM_GUESTDBG_SINGLESTEP` (avoiding the aarch64
    /// watchpoint-replay infinite loop).
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn watchpoint_slot_decode_from_far_user_slot() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        // Slot 0 unused (request_kva = 0); slots 1..=3 carry
        // distinct addresses 4 bytes apart so a FAR inside slot 2's
        // window matches exactly one slot.
        let armed_slots: [u64; 4] = [
            0,
            0xffff_ffff_8100_1000,
            0xffff_ffff_8100_1004,
            0xffff_ffff_8100_1008,
        ];
        // Construct a synthetic debug-exit payload pointing at the
        // first byte of slot 2 (DR2 / user[1]).
        let far = 0xffff_ffff_8100_1004u64;
        let hsr = (super::ESR_ELX_EC_WATCHPT_LOW) << super::ESR_ELX_EC_SHIFT;
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            hsr,
            hsr_high: 0,
            far,
        };
        let mut single_step_pending = false;
        let mut single_step_slot: usize = 99;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            !watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "slot 0 (exit_kind) must not latch when a different \
             slot's range matches FAR"
        );
        assert!(
            !watchpoint.user[0]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[0] / slot 1 must not latch — FAR is inside slot 2's \
             range"
        );
        assert!(
            watchpoint.user[1]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[1] / slot 2 must latch — FAR equals the slot's KVA"
        );
        assert!(
            !watchpoint.user[2]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[2] / slot 3 must not latch — FAR is outside its \
             range"
        );
        // Single-step bookkeeping: a watchpoint match MUST request
        // single-step on the matching slot so the next KVM_RUN
        // advances past the offending store before the slot is
        // re-armed. `single_step_slot` is a 4-bit bitmap of
        // matched slot indices; with FAR inside slot 2's range,
        // bit 2 must be the only bit set so `self_arm_watchpoint`
        // clears WCR[2].E (and leaves peer slots armed) for the
        // single-step pass.
        assert!(
            single_step_pending,
            "single_step_pending must be set when a watchpoint match \
             latches; without this the next KVM_RUN replays the same \
             store and re-trips the watchpoint forever (ARM ARM \
             D2.10.5)"
        );
        assert_eq!(
            single_step_slot, 0b0100,
            "single_step_slot bitmap must encode slot 2 (bit 2 = 1, \
             0b0100) so self_arm_watchpoint clears WCR[2].E and \
             leaves WCR[0/1/3].E armed for the single-step pass"
        );
    }
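
    /// Boundary sketch for the 4-byte FAR window the decode test
    /// above relies on: a slot armed at `kva` covers [kva, kva+4).
    /// The predicate here is a local illustration of that window,
    /// not the production matcher.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn far_window_match_boundaries() {
        let kva: u64 = 0xffff_ffff_8100_1004;
        let in_window = |far: u64| far >= kva && far < kva + 4;
        assert!(in_window(kva), "first byte is inside the window");
        assert!(in_window(kva + 3), "last byte is inside the window");
        assert!(!in_window(kva + 4), "one-past-the-end is outside");
        assert!(!in_window(kva - 1), "byte below the base is outside");
    }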

    /// Non-watchpoint EC values must be ignored. KVM_EXIT_DEBUG can
    /// surface for soft-step (EC = 0x32) or BRK (EC = 0x3C); only
    /// EC = 0x34 (`ESR_ELx_EC_WATCHPT_LOW`) means a data watchpoint
    /// fired. Other ECs must not latch any slot. Soft-step (EC =
    /// 0x32) is treated specially when `single_step_pending` is
    /// set — it clears the flag so `self_arm_watchpoint` can
    /// restore the disabled slot's WCR.E. With the flag clear,
    /// soft-step exits are spurious and must NOT touch any state.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn watchpoint_dispatch_ignores_non_watchpt_ec() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        let armed_slots: [u64; 4] = [
            0xffff_ffff_8100_1000,
            0xffff_ffff_8100_1004,
            0xffff_ffff_8100_1008,
            0xffff_ffff_8100_100C,
        ];
        // EC = 0x32 (software step) with single_step_pending = false
        // — must NOT latch and must not flip pending state.
        let hsr = 0x32u32 << super::ESR_ELX_EC_SHIFT;
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            hsr,
            hsr_high: 0,
            far: 0xffff_ffff_8100_1004,
        };
        let mut single_step_pending = false;
        let mut single_step_slot: usize = 99;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            !watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "soft-step EC must not latch slot 0"
        );
        for (i, slot) in watchpoint.user.iter().enumerate() {
            assert!(
                !slot.hit.load(std::sync::atomic::Ordering::Acquire),
                "soft-step EC must not latch user[{i}]"
            );
        }
        assert!(
            !single_step_pending,
            "spurious soft-step exit (no pending step) must leave \
             single_step_pending unchanged"
        );
        assert_eq!(
            single_step_slot, 99,
            "spurious soft-step exit must not clobber single_step_slot"
        );
    }
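
    /// EC pack/unpack sketch: the tests here construct `hsr` by
    /// shifting an EC value into place; decoding is the inverse.
    /// EC is the 6-bit field at ESR_ELx[31:26] per the ARM ARM,
    /// so the 0x3F mask below is written inline for illustration.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn esr_ec_pack_unpack_round_trip() {
        // Soft-step, watchpoint, and BRK ECs from the tests above.
        for ec in [0x32u32, 0x34, 0x3C] {
            let hsr = ec << super::ESR_ELX_EC_SHIFT;
            assert_eq!(
                (hsr >> super::ESR_ELX_EC_SHIFT) & 0x3F,
                ec,
                "EC must round-trip through the shift"
            );
        }
    }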

    /// Soft-step exit AFTER a watchpoint trap is the second half of
    /// the aarch64 watchpoint-replay-avoidance dance: it signals
    /// "the offending store has retired, you may rearm the slot
    /// now". The dispatch helper must clear `single_step_pending`
    /// (so the next `self_arm_watchpoint` call restores WCR.E=1)
    /// AND must not latch any new `hit` (the original
    /// WATCHPT_LOW exit already latched the freeze trigger).
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn watchpoint_softstep_clears_single_step_pending() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        let armed_slots: [u64; 4] = [0, 0xffff_ffff_8100_1000, 0, 0];
        let hsr = super::ESR_ELX_EC_SOFTSTP_LOW << super::ESR_ELX_EC_SHIFT;
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            hsr,
            hsr_high: 0,
            far: 0,
        };
        let mut single_step_pending = true;
        let mut single_step_slot: usize = 1;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            !single_step_pending,
            "SOFTSTP_LOW with pending step must clear \
             single_step_pending so the next self_arm_watchpoint \
             call restores WCR.E=1 and drops KVM_GUESTDBG_SINGLESTEP"
        );
        assert!(
            !watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "SOFTSTP_LOW must not latch slot 0 (the WATCHPT_LOW \
             exit that preceded it already did)"
        );
        for (i, slot) in watchpoint.user.iter().enumerate() {
            assert!(
                !slot.hit.load(std::sync::atomic::Ordering::Acquire),
                "SOFTSTP_LOW must not latch user[{i}]"
            );
        }
    }

    /// Slot 0 (`exit_kind`) MUST NOT latch when `kind_host_ptr`
    /// is null. The freeze coordinator publishes `kind_host_ptr`
    /// (Release) BEFORE `request_kva` (Release); the vCPU's
    /// `self_arm_watchpoint` only programs the hardware
    /// watchpoint after observing a non-zero `request_kva` via
    /// an Acquire load — at which point both stores are visible.
    /// A null observation here is a publication-invariant
    /// violation. Pinning the no-latch behavior so a future
    /// regression that re-introduces an unconditional fallback
    /// (e.g. "latch hit just in case") is caught: latching on
    /// null defeats the post-store error-class gate and re-fires
    /// dump emission on every clean SCX_EXIT_DONE shutdown.
    /// The single-step bookkeeping STILL fires on the matched
    /// slot — re-entering KVM_RUN without stepping past the
    /// offending store would replay the same trap forever
    /// regardless of whether the slot-0 latch ran.
    #[test]
    #[cfg(target_arch = "aarch64")]
    fn watchpoint_slot0_skips_latch_when_host_ptr_null() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        // host_ptr is null by construction (`AtomicPtr::new(null)`).
        let armed_slots: [u64; 4] = [0xffff_ffff_8100_1000, 0, 0, 0];
        let hsr = (super::ESR_ELX_EC_WATCHPT_LOW) << super::ESR_ELX_EC_SHIFT;
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            hsr,
            hsr_high: 0,
            far: 0xffff_ffff_8100_1000,
        };
        let mut single_step_pending = false;
        let mut single_step_slot: usize = 0;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            !watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "slot 0 must NOT latch hit when kind_host_ptr is null — \
             a null observation is a publication-invariant violation, \
             not a fallback trigger"
        );
        // Single-step bookkeeping must still record the matched
        // slot in the bitmap so `self_arm_watchpoint` clears
        // WCR.E on it. Without this the offending store replays
        // forever on re-entering KVM_RUN.
        assert!(
            single_step_pending,
            "single_step_pending must be set when the FAR matches a \
             slot, regardless of slot-0 latch outcome"
        );
        assert_eq!(
            single_step_slot, 0b0001,
            "single_step_slot bitmap must include slot 0 (bit 0 = 1) \
             so self_arm_watchpoint clears WCR[0].E during the \
             single-step pass"
        );
    }

    /// x86_64 sibling of `watchpoint_slot_decode_from_far_user_slot`.
    /// Constructs a synthetic `kvm_debug_exit_arch` with DR6=0x4 (B2
    /// set, indicating DR2 fired) and verifies the dispatch helper
    /// latches `user[1].hit` (slot 2 → user[1]) and leaves slot 0
    /// and the other user slots untouched. Pins the DR6→slot
    /// mapping so a future refactor that flips the bit-shift
    /// (e.g. interprets bit 2 as slot 1 instead of slot 2) is
    /// caught at unit-test time.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn watchpoint_dispatch_x86_dr6_b2_latches_user_slot_1() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        // armed_slots is consumed by the aarch64 path only; x86
        // uses DR6 alone.
        let armed_slots: [u64; 4] = [0; 4];
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            exception: 0,
            pad: 0,
            pc: 0,
            // DR6 bit 2 (B2) set ⇒ DR2 fired (slot 2 in our
            // 0=DR0..3=DR3 indexing). Other bits cleared per Intel
            // SDM Vol. 3B 17.2.5.
            dr6: 0x4,
            dr7: 0,
        };
        let mut single_step_pending = false;
        let mut single_step_slot: usize = 99;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            !watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "slot 0 (exit_kind) must not latch when DR6 B0 is clear"
        );
        assert!(
            !watchpoint.user[0]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[0] / slot 1 must not latch — DR6 B1 is clear"
        );
        assert!(
            watchpoint.user[1]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[1] / slot 2 must latch — DR6 B2 is set"
        );
        assert!(
            !watchpoint.user[2]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[2] / slot 3 must not latch — DR6 B3 is clear"
        );
        // x86 single-step bookkeeping is inert (the trap is taken
        // AFTER the offending store retires — Intel SDM 17.2.4).
        assert!(
            !single_step_pending,
            "x86 dispatch must never set single_step_pending — \
             single-step is aarch64-only"
        );
        assert_eq!(
            single_step_slot, 99,
            "x86 dispatch must not clobber single_step_slot — \
             single-step is aarch64-only"
        );
    }

    /// x86_64 multi-match: DR6=0x5 (B0 + B2) means DR0 and DR2
    /// both fired on the same exit. The dispatch helper must
    /// latch BOTH slot 0 and user[1] from a single
    /// `kvm_debug_exit_arch`. Catches a refactor that breaks at
    /// the first set bit (e.g. early `return` after `if hits[0]`).
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn watchpoint_dispatch_x86_dr6_multi_match() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        // Slot 0 needs a non-null kind_host_ptr to latch via the
        // post-store value gate. Pre-arm it with a host-side u32
        // holding an error-class kind value so
        // `latch_slot0_with_gate` takes the latch branch instead
        // of the gated-suppression branch.
        let kind: u32 = super::SCX_EXIT_ERROR_THRESHOLD;
        let kind_box = Box::new(kind);
        let kind_ptr = Box::into_raw(kind_box);
        watchpoint
            .kind_host_ptr
            .store(kind_ptr, std::sync::atomic::Ordering::Release);
        let armed_slots: [u64; 4] = [0; 4];
        let debug_arch = kvm_bindings::kvm_debug_exit_arch {
            exception: 0,
            pad: 0,
            pc: 0,
            // DR6 bits 0 + 2 set: DR0 and DR2 both fired.
            dr6: 0x5,
            dr7: 0,
        };
        let mut single_step_pending = false;
        let mut single_step_slot: usize = 99;
        super::dispatch_watchpoint_hit(
            &watchpoint,
            &debug_arch,
            &armed_slots,
            &mut single_step_pending,
            &mut single_step_slot,
        );
        assert!(
            watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "slot 0 (exit_kind) must latch — DR6 B0 set + kind ≥ \
             SCX_EXIT_ERROR_THRESHOLD"
        );
        assert!(
            !watchpoint.user[0]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[0] / slot 1 must not latch — DR6 B1 is clear"
        );
        assert!(
            watchpoint.user[1]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[1] / slot 2 must latch — DR6 B2 is set, even \
             though slot 0 latched first in iteration order"
        );
        assert!(
            !watchpoint.user[2]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "user[2] / slot 3 must not latch — DR6 B3 is clear"
        );
        // SAFETY: kind_box ownership round-trip via Box::into_raw
        // / Box::from_raw matches the standard pattern; we own
        // the only pointer to this allocation in the test
        // function and the dispatch helper has finished its
        // read_volatile before this drop.
        let _ = unsafe { Box::from_raw(kind_ptr) };
    }

    /// CAS-based dedup in `latch_hit`: a second call (e.g. two
    /// vCPUs each writing the watched address, or a re-fire on
    /// the same vCPU before the freeze coordinator resets `hit`)
    /// must not write a second eventfd edge. Pinning the dedup so
    /// a future refactor that drops the CAS in favour of an
    /// unconditional `store(true)` is caught here — coordinator
    /// would then rendezvous N times for one logical fire.
    #[test]
    fn latch_hit_is_idempotent_across_repeat_calls() {
        use crate::vmm::vcpu::WatchpointArm;
        use std::os::fd::AsRawFd;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");

        // First latch: must flip false→true and write eventfd.
        watchpoint.latch_hit();
        assert!(
            watchpoint.hit.load(std::sync::atomic::Ordering::Acquire),
            "first latch_hit must flip hit=false→true"
        );

        // Drain the eventfd to verify there is exactly one write
        // pending. EFD_NONBLOCK + counter mode: a single read
        // returns the accumulated count and resets the internal
        // counter.
        let mut buf = [0u8; 8];
        let n = unsafe {
            libc::read(
                watchpoint.hit_evt.as_raw_fd(),
                buf.as_mut_ptr() as *mut libc::c_void,
                buf.len(),
            )
        };
        assert_eq!(
            n, 8,
            "first latch_hit must produce one eventfd edge \
             (8-byte counter read)"
        );
        let count_after_first = u64::from_ne_bytes(buf);
        assert_eq!(
            count_after_first, 1,
            "first latch_hit must increment counter by exactly 1"
        );

        // Second latch (without coordinator reset): CAS fails,
        // no eventfd write. Counter stays at 0 — a subsequent
        // read returns EAGAIN (no edge available).
        watchpoint.latch_hit();
        let mut buf2 = [0u8; 8];
        let n2 = unsafe {
            libc::read(
                watchpoint.hit_evt.as_raw_fd(),
                buf2.as_mut_ptr() as *mut libc::c_void,
                buf2.len(),
            )
        };
        let errno = unsafe { *libc::__errno_location() };
        assert!(
            n2 == -1 && errno == libc::EAGAIN,
            "second latch_hit on already-latched slot must NOT \
             write to hit_evt (cross-vCPU dedup); read should \
             return EAGAIN, got n={n2} errno={errno}"
        );
    }
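
    /// Standalone sketch of the eventfd counter semantics the two
    /// dedup tests lean on: with a zero-initialised, EFD_NONBLOCK
    /// eventfd, each 8-byte write adds to the counter, one read
    /// drains the accumulated count, and a drained fd reads
    /// EAGAIN. Raw libc only; no production types involved.
    #[test]
    fn eventfd_counter_semantics() {
        let fd = unsafe { libc::eventfd(0, libc::EFD_NONBLOCK) };
        assert!(fd >= 0, "eventfd(2) must succeed");
        let one: u64 = 1;
        for _ in 0..3 {
            let n = unsafe {
                libc::write(fd, &one as *const u64 as *const libc::c_void, 8)
            };
            assert_eq!(n, 8, "each counter write is 8 bytes");
        }
        let mut buf = [0u8; 8];
        let n = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut libc::c_void, 8) };
        assert_eq!(n, 8);
        assert_eq!(
            u64::from_ne_bytes(buf),
            3,
            "one read drains the whole accumulated count"
        );
        let n2 = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut libc::c_void, 8) };
        let errno = unsafe { *libc::__errno_location() };
        assert!(
            n2 == -1 && errno == libc::EAGAIN,
            "drained eventfd must read EAGAIN"
        );
        unsafe { libc::close(fd) };
    }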

    /// CAS-based dedup in `latch_user_hit`: same invariant as the
    /// slot-0 latch, applied to user slots 1..=3. Catches an off-
    /// by-one in the per-slot CAS (e.g. CAS on slot 0's `hit`
    /// instead of `user[idx].hit`).
    #[test]
    fn latch_user_hit_is_idempotent_across_repeat_calls() {
        use crate::vmm::vcpu::WatchpointArm;
        use std::os::fd::AsRawFd;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");

        watchpoint.latch_user_hit(1);
        assert!(
            watchpoint.user[1]
                .hit
                .load(std::sync::atomic::Ordering::Acquire),
            "first latch_user_hit(1) must flip user[1].hit=false→true"
        );

        let mut buf = [0u8; 8];
        let n = unsafe {
            libc::read(
                watchpoint.hit_evt.as_raw_fd(),
                buf.as_mut_ptr() as *mut libc::c_void,
                buf.len(),
            )
        };
        assert_eq!(n, 8, "first latch_user_hit(1) must write eventfd");
        let count_after_first = u64::from_ne_bytes(buf);
        assert_eq!(count_after_first, 1, "counter increment must be 1");

        watchpoint.latch_user_hit(1);
        let mut buf2 = [0u8; 8];
        let n2 = unsafe {
            libc::read(
                watchpoint.hit_evt.as_raw_fd(),
                buf2.as_mut_ptr() as *mut libc::c_void,
                buf2.len(),
            )
        };
        let errno = unsafe { *libc::__errno_location() };
        assert!(
            n2 == -1 && errno == libc::EAGAIN,
            "second latch_user_hit(1) on already-latched slot \
             must NOT write to hit_evt; read should return \
             EAGAIN, got n={n2} errno={errno}"
        );

        // Out-of-range idx must be a silent no-op — no panic, no
        // hit on any slot, no eventfd write. Catches a future
        // refactor that drops the bounds check.
        watchpoint.latch_user_hit(99);
        for (i, slot) in watchpoint.user.iter().enumerate() {
            if i == 1 {
                assert!(
                    slot.hit.load(std::sync::atomic::Ordering::Acquire),
                    "user[1].hit must remain latched"
                );
            } else {
                assert!(
                    !slot.hit.load(std::sync::atomic::Ordering::Acquire),
                    "user[{i}].hit must remain unlatched after \
                     out-of-range latch_user_hit(99)"
                );
            }
        }
    }

    /// `WatchpointArm::mark_armed` flips the `any_armed` gate
    /// from 0 to 1 and is idempotent. Pins the gate's role: the
    /// publisher (freeze coordinator's err_exit publish or
    /// `arm_user_watchpoint`) calls `mark_armed` AFTER the
    /// `Release` store on `request_kva`; until then
    /// `self_arm_watchpoint` short-circuits without doing the
    /// per-slot Acquire reads.
    #[test]
    fn mark_armed_flips_gate_and_is_idempotent() {
        use crate::vmm::vcpu::WatchpointArm;
        let watchpoint = WatchpointArm::new().expect("WatchpointArm::new");
        assert_eq!(
            watchpoint
                .any_armed
                .load(std::sync::atomic::Ordering::Relaxed),
            0,
            "newly-constructed WatchpointArm must have any_armed=0 \
             so self_arm_watchpoint short-circuits before any \
             publisher fires"
        );
        watchpoint.mark_armed();
        assert_eq!(
            watchpoint
                .any_armed
                .load(std::sync::atomic::Ordering::Relaxed),
            1,
            "first mark_armed call must flip the gate to 1"
        );
        // Idempotence: a second call must leave the gate at 1.
        // Catches a refactor that turned mark_armed into an
        // increment / fetch_add, which would leave the gate at 2
        // here instead of 1.
        watchpoint.mark_armed();
        assert_eq!(
            watchpoint
                .any_armed
                .load(std::sync::atomic::Ordering::Relaxed),
            1,
            "second mark_armed call must leave the gate at 1 \
             (idempotent — mark_armed is `store(1)`, not `+= 1`)"
        );
    }
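
    /// Minimal sketch of the publish pattern the gate pins:
    /// payload stored before the flag, flag stored with Release,
    /// consumer Acquire-loads the flag and only then reads the
    /// payload. Plain std atomics; an illustration of the
    /// ordering contract, not the production coordinator.
    #[test]
    fn release_acquire_publish_pattern_sketch() {
        use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
        use std::sync::Arc;
        let payload = Arc::new(AtomicU64::new(0));
        let flag = Arc::new(AtomicBool::new(false));
        let (p, f) = (Arc::clone(&payload), Arc::clone(&flag));
        let publisher = std::thread::spawn(move || {
            p.store(0xdead_beef, Ordering::Relaxed);
            f.store(true, Ordering::Release); // publish AFTER the payload
        });
        while !flag.load(Ordering::Acquire) {
            std::hint::spin_loop();
        }
        // The Acquire load above synchronises-with the Release
        // store, making the payload store visible here.
        assert_eq!(payload.load(Ordering::Relaxed), 0xdead_beef);
        publisher.join().unwrap();
    }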
}