ud-emulator 0.1.5

Pure-Rust 32-bit x86 emulator + PE runtime loader + Win32 host shims. Mirrors oxideav-vfw; intended to grow into the dynamic-analysis backend that informs decompilation (indirect-target recovery, constant-data discovery).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
//! Win32 stub registry + per-DLL host implementations of the
//! functions the loaded codec DLLs import.
//!
//! Each stub is a Rust function pointer with the signature
//! [`StubFn`]. The PE loader, when populating the IAT, looks up
//! `(dll_name_lowercased, function_name)` in [`Registry`] and
//! writes the synthetic `StubAddr` (a guest address that lives
//! in the unmapped "thunk space" near `0xFFFE_0000`) into the
//! IAT slot.
//!
//! At call time, the integer ISA executor sees `eip` jump to a
//! thunk address. It detects this via [`Registry::is_thunk`]
//! and dispatches to the stub directly, popping the right number
//! of bytes off the guest stack for the calling convention.
//!
//! All stubs are stdcall (callee-cleanup) for round 1; the
//! `arg_dwords` field carries the count. Round-2 will add cdecl
//! (caller-cleanup) once vfw32 needs it.
//!
//! Reference for each function: the corresponding MSDN page
//! (linked in source comments next to each stub).

use std::collections::BTreeMap;

use crate::emulator::{Cpu, Mmu};

pub mod advapi32;
pub mod comctl32;
pub mod gdi32;
pub mod kernel32;
pub mod mfplat;
pub mod msi;
pub mod msvcrt;
pub mod ole32;
pub mod shell32;
pub mod shlwapi;
pub mod user32;
pub mod version;
pub mod vfw32;
pub mod winmm;

/// First synthetic thunk address. Chosen well above any plausible
/// `ImageBase + section.VirtualAddress` so it cannot be mistaken
/// for a real DLL byte. Each registered stub gets the next
/// 16-byte slot (see `THUNK_STRIDE`).
pub const THUNK_BASE: u32 = 0xFFFE_0000;
/// Size, in bytes, of one thunk slot — the distance between the
/// synthetic addresses of two consecutively registered stubs.
const THUNK_STRIDE: u32 = 16;

/// Signature every Win32 stub uses.
///
/// On success the stub returns `Ok(dword)`, the value to put in
/// `eax` on return; on failure it returns a [`Win32Error`]. The
/// stub internally reads its arguments off the guest stack via the
/// [`Cpu`] / [`Mmu`] handles. The runtime takes care of popping
/// `arg_dwords * 4` bytes from the guest stack after the stub
/// returns (stdcall callee-cleanup).
///
/// `&mut HostState` gives the stub access to the host-side
/// "operating system" state (heap, modules, trace buffers, …).
///
/// `&Registry` is passed so a stub can re-enter the run-loop to
/// call back into the guest (used by the round-2 `vfw32` stub
/// surface, which has to dispatch the codec DLL's `DriverProc`
/// before returning to the IAT caller).
pub type StubFn = fn(&mut Cpu, &mut Mmu, &mut HostState, &Registry) -> Result<u32, Win32Error>;

/// One stub call recorded for analysis. Populated whenever
/// [`HostState::trace_stubs`] is set; the [`HostState::stub_calls`]
/// vector accumulates these in call order.
///
/// This is the structured counterpart of the formatted lines in
/// [`HostState::stub_trace`].
#[derive(Clone, Debug)]
pub struct StubCall {
    /// The DLL the call targeted (`"kernel32.dll"`, …).
    pub dll: String,
    /// The function name (`"CreateFileA"`, …).
    pub name: String,
    /// Dword arguments captured off the guest stack at call
    /// entry, before the stub ran. Length is the stdcall
    /// `arg_dwords` count, or a per-call override for known
    /// cdecl shapes.
    pub args: Vec<u32>,
    /// Whatever `eax` value the stub returned.
    pub ret: u32,
    /// Call-site EIP — the saved return address on the guest
    /// stack at call entry, i.e. the instruction the codec
    /// will resume at when the stub returns.
    pub call_site_eip: u32,
}

/// Information stored alongside each stub: the `(dll, name)` pair
/// it answers to, the host function implementing it, and the data
/// the run loop needs to dispatch and clean up after a call.
#[derive(Clone)]
pub struct StubEntry {
    /// DLL half of the `(dll, name)` lookup key.
    /// NOTE(review): the module docs describe the key as
    /// `dll_name_lowercased`, so this is presumably stored
    /// lower-cased — confirm against the registration code.
    pub dll: String,
    /// Imported function name half of the lookup key.
    pub name: String,
    /// Host implementation invoked when the guest calls the thunk.
    pub func: StubFn,
    /// Number of dword arguments to pop off the stack (stdcall
    /// callee-cleanup). cdecl callers will be added in round 2
    /// with a separate flag.
    pub arg_dwords: u32,
    /// The synthetic guest address that, when called, invokes
    /// this stub.
    pub thunk_addr: u32,
}

/// Failure modes a Win32 stub — or the run loop driving it — can
/// report. Wrapped in `crate::Error::Win32`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Win32Error {
    /// The requested `(dll, name)` pair has no registered stub.
    /// This is a PE-load-time error; it surfaces from
    /// `crate::pe::Loader::resolve_imports`.
    UnknownImport { dll: String, name: String },
    /// A stub rejected one of its arguments; `reason` explains why.
    InvalidArgument { stub: &'static str, reason: String },
    /// A heap call named an address that is not a known allocation.
    InvalidHeapBlock { stub: &'static str, addr: u32 },
    /// The per-run instruction budget configured on
    /// [`HostState::instruction_budget`] ran out before the guest
    /// reached `RET_SENTINEL`. Analysis front-ends rely on this to
    /// cap adversarial samples that loop. Everything captured up
    /// to the budget point — coverage map, stub trace, register
    /// snapshot — is still valid.
    BudgetExhausted { executed: u64 },
}

impl core::fmt::Display for Win32Error {
    /// Write the one-line, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::UnknownImport { dll, name } => {
                write!(f, "no Round-1 stub for import {dll}!{name}")
            }
            Self::InvalidArgument { stub, reason } => write!(f, "{stub}: {reason}"),
            Self::InvalidHeapBlock { stub, addr } => {
                // `#010x` = "0x" prefix plus eight zero-padded hex digits.
                write!(f, "{stub}: unknown heap allocation {addr:#010x}")
            }
            Self::BudgetExhausted { executed } => write!(
                f,
                "instruction budget exhausted after {executed} steps without reaching RET_SENTINEL"
            ),
        }
    }
}

/// One entry in the open-codec table — a "Handle to Installable
/// Compressor" in MSDN's vfw32 vocabulary. Entries are stored in
/// [`ProcessState::hics`], keyed by the synthetic `HIC` value.
#[derive(Debug, Clone)]
pub struct HicEntry {
    /// 4-byte fcc type ('VIDC' for video).
    pub fcc_type: u32,
    /// 4-byte fcc handler ('cvid' for Cinepak, 'IV50' for Indeo 5).
    pub fcc_handler: u32,
    /// Open mode (vfw.h: 1 = ICMODE_COMPRESS, 2 = ICMODE_DECOMPRESS, …).
    pub mode: u32,
    /// VA of the codec DLL's `DriverProc` export (the entry point
    /// that every IC* call dispatches into).
    pub driver_proc_va: u32,
    /// `dwDriverId` to pass back to `DriverProc` on every call —
    /// the value `DriverProc(_, _, DRV_OPEN, _, _)` returned.
    pub driver_id: u32,
}

/// Per-process emulator state. Phase 1 of the scheduler refactor
/// (see `magical-popping-oasis` plan): every field that is
/// conceptually scoped to a single Win32 *process* lives here.
/// [`HostState`] keeps these in [`HostState::processes`], a
/// `BTreeMap` keyed by pid, and selects the current one with the
/// [`HostState::active_pid`] cursor. Stub bodies stay unchanged
/// because they reach this state through [`HostState`]'s
/// auto-deref.
///
/// The conceptual boundary: anything a child process spawned via
/// `CreateProcessA` should NOT share with its parent goes here.
/// Anything truly shared (virtual filesystem, virtual registry,
/// host-side trace buffers, the clock, the instruction budget)
/// stays on [`HostState`].
///
/// Note that the derived `Default` zeroes every field. The
/// non-zero starting values mentioned on individual fields below
/// (`pid`, `process_heap_handle`, `next_hic`, `rand_state`) are
/// the ones installed by [`ProcessState::new`].
#[derive(Default)]
pub struct ProcessState {
    /// Synthetic process identifier. `1` for the bootstrap
    /// process; `CreateProcessA` mints monotonically
    /// increasing values.
    pub pid: u32,
    /// PID of the process that called `CreateProcessA` to spawn
    /// this one. `0` for the bootstrap (no parent).
    pub parent_pid: u32,
    /// Image base where the process's primary PE is mapped.
    /// Each process gets a unique base so child PEs don't
    /// collide with the parent — `0x00400000` for the bootstrap,
    /// `0x10000000` / `0x20000000` / … for spawned children.
    pub image_base: u32,
    /// Exit code reported by `ExitProcess` / `TerminateProcess`
    /// on this process; `None` while the process is still
    /// running. Wakes any `WaitForSingleObject` on the process
    /// handle when set.
    pub exit_code: Option<u32>,
    /// Heap allocations keyed by guest address.
    pub heap: BTreeMap<u32, Vec<u8>>,
    /// Cursor for the next heap allocation. Walks through a
    /// dedicated guest-virtual region (configured by [`HostState::new`]).
    pub heap_cursor: u32,
    /// Exclusive upper bound of the heap arena; the arena is
    /// `[heap_start, heap_arena_end)` as passed to
    /// [`ProcessState::new`].
    pub heap_arena_end: u32,
    /// Default process heap handle returned by `GetProcessHeap`.
    /// [`ProcessState::new`] sets it to `0xDEAD_BEEF`.
    pub process_heap_handle: u32,
    /// Loaded-module registry: name → ImageBase.
    pub modules: BTreeMap<String, u32>,
    /// Most-recently-loaded codec module's image base — returned
    /// by `GetModuleHandleA(NULL)`. Set to 0 if no DLL has been
    /// loaded yet.
    pub primary_module_base: u32,
    /// Open codec handles. Synthesised inside the host (no codec
    /// guest memory is consumed); each handle is a small integer
    /// the codec sees as an `HIC`.
    pub hics: BTreeMap<u32, HicEntry>,
    /// Counter for the next synthetic HIC. Starts at 1 (as set by
    /// [`ProcessState::new`]); 0 means "open failed".
    pub next_hic: u32,
    /// Default `DriverProc` VA used when a host caller invokes an
    /// `IC*` stub but has not staged a real codec image (i.e. for
    /// the no-fixture unit tests). Set to 0 when no codec is
    /// loaded — `ICOpen` then refuses to mint a HIC.
    pub default_driver_proc: u32,
    /// Set by `kernel32!ExitProcess` (and `TerminateProcess` on
    /// the current process) to break out of the emulator loop in
    /// lieu of unwinding to `RET_SENTINEL`. `Some(code)` means
    /// "this process asked to terminate"; the run-loop converts
    /// this into a clean return so the calling host code can
    /// introspect what happened.
    pub exit_requested: Option<u32>,
    /// Read-only constant-data arena. Used by stubs like
    /// `GetCommandLineA` / `GetEnvironmentStrings` that need to
    /// hand out stable guest pointers to canned strings. The
    /// slab grows by `arena_const_alloc` and lives at
    /// `[const_arena_start, const_arena_end)`. Configured like
    /// the heap arena; per the [`HostState::new`] docs it stays
    /// at zero until [`HostState::with_const_arena`] is called.
    pub const_arena_cursor: u32,
    /// Exclusive upper bound of the constant-data arena.
    pub const_arena_end: u32,
    /// Cached pointer to the canned `"oxideav-vfw\0"` command
    /// line. Lazily populated by `GetCommandLineA`.
    pub command_line_ptr: u32,
    /// Cached pointer to the canned empty environment block.
    pub environment_strings_ptr: u32,
    /// Currently-live `HDC` values handed out by
    /// `gdi32!CreateCompatibleDC` / `user32!GetDC`. `None` until
    /// the first DC is allocated, then a populated set.
    pub gdi_hdcs: Option<std::collections::BTreeSet<u32>>,
    /// Round 26 — synthetic `HWND` registry. `CreateWindowExA`
    /// hands out `HWND_BASE + n` values; `IsWindow` consults this
    /// set; `DestroyWindow` removes from it. None of these HWNDs
    /// back a real window — DirectShow / VfW codecs only need the
    /// `HWND` value to feel non-NULL so they fall through to
    /// their headless code path.
    pub hwnd_registry: std::collections::BTreeSet<u32>,
    /// Counter for the next synthetic HWND allocation. Starts at
    /// 0; first HWND handed out is `HWND_BASE + 0`.
    pub next_hwnd_index: u32,
    /// Set of `DriverProc` VAs that have already received the
    /// one-time `DRV_LOAD` + `DRV_ENABLE` initialisation pair.
    /// Round 11 — without this, `IR50_32.DLL`'s `DRV_LOAD` handler
    /// (which allocates the codec's huffman / inverse-DCT tables
    /// at `[0x1009c770]`) never runs, and `ICDecompress` later
    /// reads `[0x1009c770] == NULL` and bails with
    /// `ICERR_BADIMAGE`. We track per-VA so multi-codec sandboxes
    /// (round 12+) don't double-load the same driver.
    pub loaded_drivers: std::collections::BTreeSet<u32>,
    /// Per-loaded-module resource directory location:
    /// `image_base → resource_dir_va`. Empty if the module has no
    /// `.rsrc` (PE Data Directory entry 2). Round 12 needs this so
    /// that `kernel32!FindResourceA` on `IR50_32.DLL` can locate
    /// the RT_BITMAP/112 entry that holds the codec's huffman /
    /// inverse-DCT tables. Without it the codec's `DRV_LOAD`
    /// chain bails at `0x10034d31 (jz 0x10034f61)` and
    /// `[0x1009c770]` stays NULL.
    pub module_resource_dirs: BTreeMap<u32, u32>,
    /// Round 25 — host-side bookkeeping for COM objects the
    /// guest has handed back to the test harness (live class
    /// factories + IBaseFilter pointers etc).  See
    /// [`crate::com::ComObjectTable`] for the data layout.  Each
    /// `ole32!CoCreateInstance` and every test-side
    /// `query_interface` / `add_ref` / `release` updates this
    /// table so a missing `Release` surfaces as a non-zero
    /// `total_refcount()` at end-of-test.
    pub com: crate::com::ComObjectTable,
    /// Round 55 — PRNG state for `msvcrt!rand` calls from
    /// sandboxed codec code.  The initial value `1` (installed by
    /// [`ProcessState::new`]) matches MSVC's documented "no
    /// `srand` called yet" initial state.
    /// Updated by both `msvcrt!srand(seed)` (from guest code)
    /// and by `Sandbox::set_rand_seed` / `with_rand_seed` (from
    /// host code) — they share the same field so the host can
    /// observe what the codec did to the state, and the codec
    /// can override the host-staged seed via its own `srand`.
    /// LCG step (Knuth-style, mod 2^32, output bits 30..16
    /// masked to 15 bits per MSVC's documented contract):
    ///
    /// ```text
    /// state = state * 214013 + 2531011  (mod 2^32)
    /// rand  = (state >> 16) & 0x7FFF
    /// ```
    pub rand_state: u32,
    /// Cursor for the next `TlsAlloc` slot. Slot indices are
    /// process-scoped (each `TlsAlloc` mints a fresh integer)
    /// but the *values* stored at those indices live in
    /// per-thread [`ThreadState::tls_slots`]. Phase 2 of the
    /// scheduler refactor.
    pub next_tls_slot: u32,
    /// Bottom of the thread-stack pool. `CreateThread` carves
    /// stacks from `[bottom, top)` walking down from
    /// `next_thread_stack_top`. Both are `0` when no pool has
    /// been configured — `CreateThread` reports an
    /// `InvalidArgument` error in that case.
    pub thread_stack_pool_bottom: u32,
    /// Next available stack-top for the next `CreateThread`.
    /// Decrements by [`THREAD_STACK_SIZE`] per spawned thread.
    pub next_thread_stack_top: u32,
    /// Bottom of the per-thread TIB (Thread Information Block)
    /// pool. `CreateThread` carves a 4 KiB TIB region per
    /// spawned thread. Phase 6 of the scheduler refactor.
    pub tib_pool_bottom: u32,
    /// Next available TIB base. Increments by
    /// [`THREAD_TIB_SIZE`] per spawned thread.
    pub next_tib_addr: u32,
}

/// Size of one per-thread TIB region, in bytes (4 KiB). The
/// emulator only needs room for the handful of fields that
/// installer / codec CRTs actually touch: the SEH chain head at
/// 0x00, the self pointer at 0x18, LastError at 0x34, and so on.
pub const THREAD_TIB_SIZE: u32 = 4 * 1024;

/// Distance between the image bases of consecutive child PEs
/// (256 MiB). That leaves each child plenty of room for its
/// sections, heap, stack and TIB without running into the
/// neighbouring process.
pub const CHILD_IMAGE_STRIDE: u32 = 256 * 1024 * 1024;

/// Heap arena size reserved for each spawned child process
/// (16 MiB), carved out of the host's child-heap pool.
pub const CHILD_HEAP_SIZE: u32 = 16 * 1024 * 1024;

/// Default stack size for a spawned thread, in bytes (64 KiB).
/// This matches the typical Win32 reserve size; many codec /
/// installer threads only ever use a few hundred bytes of it.
pub const THREAD_STACK_SIZE: u32 = 64 * 1024;

impl ProcessState {
    /// Construct a fresh process with the heap arena at
    /// `[heap_start, heap_end)` (caller is responsible for
    /// mapping that region in the MMU as R+W).
    #[must_use]
    pub fn new(heap_start: u32, heap_end: u32) -> Self {
        ProcessState {
            pid: 1,
            heap_cursor: heap_start,
            heap_arena_end: heap_end,
            process_heap_handle: 0xDEAD_BEEF,
            next_hic: 1,
            rand_state: 1,
            ..ProcessState::default()
        }
    }
}

/// Per-thread emulator state. Phase 2 of the scheduler refactor:
/// the live `Cpu` in [`crate::Sandbox`] continues to drive
/// execution, but every thread-local Win32 surface (TLS slots,
/// priority, parked CPU register file from past quanta) lives
/// here. Phase 3 will swap the live `Cpu` value with
/// `ThreadState::parked_cpu` on context switch.
///
/// The `parked_cpu` field is `None` for the currently-running
/// thread (its register file lives on `Sandbox::cpu`) and
/// `Some(cpu)` for any thread that has been suspended,
/// preempted, or is waiting on a synchronization object.
///
/// Use [`ThreadState::new`] to bind a thread to a `(tid, pid)`
/// pair; the `Default` impl leaves both identifiers at `0`.
pub struct ThreadState {
    /// Synthetic thread identifier. The first thread is `1`;
    /// `CreateThread` mints monotonically increasing values.
    pub tid: u32,
    /// Owning process. For Phase 2 every thread maps to
    /// process `1`.
    pub pid: u32,
    /// Windows thread priority axis. Default is
    /// `THREAD_PRIORITY_NORMAL = 0`. Range `-15..15` for the
    /// realtime / idle extremes.
    pub priority: i32,
    /// Map of TLS slot → value, set by `TlsSetValue` and read
    /// by `TlsGetValue`. Slot indices come from
    /// [`ProcessState::next_tls_slot`].
    pub tls_slots: BTreeMap<u32, u32>,
    /// Parked register file. Phase 3 will populate this when
    /// the scheduler swaps out the live `Cpu`.
    pub parked_cpu: Option<Cpu>,
    /// Quantum remaining for the scheduler's current slice.
    /// Phase 4 will tick this down; for now it's a placeholder
    /// initialised to the default quantum ([`DEFAULT_QUANTUM`]).
    pub quantum_remaining: u32,
    /// Lifecycle state — driven by the scheduler. A default-built
    /// thread starts as `Ready`.
    pub status: crate::sched::ThreadStatus,
    /// Active wait, if `status == Waiting`. Cleared on wake.
    pub wait: Option<crate::sched::WaitCondition>,
    /// Per-thread TIB (Thread Information Block) base — guest
    /// VA the thread's CPU references via FS:[0]. `0` for the
    /// bootstrap thread (which uses the runtime's shared
    /// `TEB_BASE`); `CreateThread` carves a fresh page out of
    /// the per-process TIB pool for each new thread.
    pub tib_addr: u32,
}

impl Default for ThreadState {
    /// An unbound thread: identifiers and TIB base are zero,
    /// priority is normal (`0`), nothing is parked or waited on,
    /// the TLS map is empty, and the thread is `Ready` with a full
    /// [`DEFAULT_QUANTUM`] slice.
    fn default() -> Self {
        Self {
            // Identity / addressing — all unset.
            tid: 0,
            pid: 0,
            tib_addr: 0,
            // Scheduling.
            priority: 0,
            quantum_remaining: DEFAULT_QUANTUM,
            status: crate::sched::ThreadStatus::Ready,
            wait: None,
            // Thread-local data.
            parked_cpu: None,
            tls_slots: BTreeMap::new(),
        }
    }
}

impl ThreadState {
    /// Construct a fresh thread bound to the given process.
    #[must_use]
    pub fn new(tid: u32, pid: u32) -> Self {
        ThreadState {
            tid,
            pid,
            ..ThreadState::default()
        }
    }
}

/// Default scheduler quantum, in guest instructions. Phase 4
/// will start consulting this; until then the value is purely
/// informational so `ThreadState::quantum_remaining` has a
/// sensible initial value (it is what `ThreadState::default`
/// stores there).
pub const DEFAULT_QUANTUM: u32 = 10_000;

/// The host-side state every stub may read or mutate.
///
/// This is the "operating system" of the sandbox — the heap, the
/// LastError TLS, the pseudo-tick counter, the loaded-module
/// registry, etc. One per emulator instance.
///
/// `HostState` is the union of (a) **truly shared** state
/// (virtual filesystem, virtual registry, trace buffers, clock,
/// instruction budget) and (b) the **current process**'s
/// [`ProcessState`], exposed through [`std::ops::Deref`] so that
/// existing stub bodies that read `state.heap` / `state.modules`
/// / etc. continue to compile unchanged. The split prepares
/// Phase 5 of the scheduler refactor (`CreateProcessA` spawning
/// a real child PE) without churning every Win32 stub today.
pub struct HostState {
    /// Process table keyed by PID. The bootstrap process has
    /// `pid = 1`; `CreateProcessA` mints children with
    /// monotonically increasing PIDs. The
    /// [`std::ops::Deref`] / [`std::ops::DerefMut`] impls on
    /// `HostState` resolve `state.heap` / `state.modules` /
    /// etc. through the *active* process, so stub bodies
    /// continue to compile unchanged.
    pub processes: BTreeMap<u32, ProcessState>,
    /// PID of the currently-running process — points into
    /// [`Self::processes`]. The scheduler updates this when
    /// switching threads across process boundaries. It must
    /// always name a live entry: the `Deref` / `DerefMut` impls
    /// panic otherwise.
    pub active_pid: u32,
    /// Cursor for the next `CreateProcessA` to mint a PID.
    pub next_pid: u32,
    /// Image base for the next child PE loaded via
    /// `CreateProcessA`. `0` until a child-image arena is
    /// configured (via [`Self::with_child_image_arena`]); the
    /// runtime walks the cursor forward by [`CHILD_IMAGE_STRIDE`]
    /// per spawn.
    pub next_child_image_base: u32,
    /// Heap-arena pool the next child process carves its
    /// per-process heap from. `[next_child_heap_base,
    /// child_heap_arena_end)`. Each spawn takes
    /// [`CHILD_HEAP_SIZE`] bytes.
    pub next_child_heap_base: u32,
    /// Exclusive upper bound of the child heap-arena pool
    /// described on [`Self::next_child_heap_base`].
    pub child_heap_arena_end: u32,
    /// Thread table keyed by TID. Phase 2 of the scheduler
    /// refactor: there is always at least one thread, with
    /// `tid = 1`, owning the live `Cpu` on `Sandbox`. Phase 3
    /// will populate more entries when `CreateThread` mints a
    /// real thread, and the scheduler will move the live `Cpu`
    /// in/out of `parked_cpu` here on context switch.
    pub threads: BTreeMap<u32, ThreadState>,
    /// TID of the currently-executing thread — points into
    /// [`Self::threads`]. Phase 3 will mutate this on context
    /// switch.
    pub active_tid: u32,
    /// Cursor for the next `CreateThread` to mint a TID. Stays
    /// monotonic across the lifetime of the sandbox.
    pub next_tid: u32,
    /// Last error code (`SetLastError` / `GetLastError`). Phase
    /// 6 will mirror this through the per-thread TIB at
    /// FS:[0x34] so guest code reading it directly sees the
    /// per-thread value. For now (single thread) the field on
    /// HostState is the source of truth.
    pub last_error: u32,
    /// Lazily-allocated guest address of the C-CRT `errno` cell.
    /// `None` until the first call to `msvcrt::_errno`, then
    /// stable for the lifetime of the sandbox so repeated calls
    /// return the same pointer (the contract `int * _errno(void)`
    /// requires).
    pub errno_cell: Option<u32>,
    /// Pseudo-tick counter incremented on every `GetTickCount`.
    pub tick: u32,
    /// Lines that the codec wrote to `OutputDebugString*`. Tests
    /// can introspect to confirm a known string was emitted.
    pub debug_log: Vec<String>,
    /// Lines that the codec wrote to `MessageBoxA` (also mirrored
    /// to `eprintln!`). Distinct from `debug_log` so a test can
    /// distinguish OutputDebugStringA traffic from real popups.
    pub message_box_log: Vec<String>,
    /// Optional per-run instruction budget. Decremented at each
    /// top-of-loop iteration in [`run_until_sentinel`] (both
    /// instruction steps and stub dispatches count). When it
    /// hits zero the run loop bails with
    /// [`Win32Error::BudgetExhausted`] so adversarial guests
    /// can't loop the host. `None` (the default) keeps the
    /// historical unbounded behaviour.
    pub instruction_budget: Option<u64>,
    /// Counts how many instructions actually ran in the last
    /// (or current) run-loop session. Useful for the analysis
    /// front-ends to report "ran for N instructions, budget
    /// was M". Reset to zero on each top-level run entry.
    pub instructions_executed: u64,
    /// Emulation-context layer (virtual filesystem, virtual
    /// registry, future surfaces). See [`crate::context::Context`]
    /// for the contract.
    ///
    /// NOTE(review): earlier wording called this layer optional
    /// ("when `None` …"), but the field is a plain `Context`
    /// (initialised with `Context::default()`), not an `Option`.
    /// Presumably it is the individual surfaces inside `Context`
    /// that can be absent, in which case the Win32 stubs that
    /// would consult them fall through to their fail-soft
    /// default — confirm against `Context`.
    pub context: crate::context::Context,
    /// When `true`, [`dispatch_stub`] appends one line per Win32
    /// call to [`HostState::stub_trace`]. Off by default; round-8 tests flip
    /// it on while triaging which stub returns a bad value.
    pub trace_stubs: bool,
    /// Per-call trace lines populated when [`HostState::trace_stubs`] is on.
    /// Format: `dll!name(arg0, arg1, …) → 0xRET`. The args are
    /// the first `arg_dwords` (or, for known cdecl shapes, the
    /// override from [`cdecl_trace_arg_count`]) dwords off the
    /// guest stack, captured BEFORE the stub mutates them.
    pub stub_trace: Vec<String>,
    /// Structured per-call log, populated when [`HostState::trace_stubs`]
    /// is on. Parallel to [`HostState::stub_trace`]; analysis front-ends
    /// (the `ud analyze` JSON output) consume this directly so
    /// they don't have to re-parse the formatted string.
    pub stub_calls: Vec<StubCall>,
    /// Scheduler-owned wait-object table + global instruction
    /// clock. Phase 3 of the scheduler refactor.
    pub scheduler: crate::sched::Scheduler,
    /// When `Some`, the most recently dispatched stub asked the
    /// run loop to switch threads. The run loop drains the
    /// field after every stub return: a `Wait(...)` moves the
    /// current thread to `Waiting`; `Yield` re-queues it at the
    /// end of the Ready queue; `Exit { code }` terminates it.
    pub yield_requested: Option<crate::sched::YieldRequest>,
}

impl Default for HostState {
    /// Build a host state holding exactly one bootstrap thread
    /// (`tid = 1`, owned by `pid = 1`) and one default-initialised
    /// bootstrap process (`pid = 1`). Every cursor, log and counter
    /// starts empty / zero, tracing is off, and no instruction
    /// budget is set.
    fn default() -> Self {
        // Bootstrap thread first, then the bootstrap process — the
        // same construction order the maps have always been built in.
        let threads = BTreeMap::from([(1, ThreadState::new(1, 1))]);
        let mut bootstrap_process = ProcessState::default();
        bootstrap_process.pid = 1;
        let processes = BTreeMap::from([(1, bootstrap_process)]);
        Self {
            processes,
            active_pid: 1,
            next_pid: 2,
            next_child_image_base: 0,
            next_child_heap_base: 0,
            child_heap_arena_end: 0,
            threads,
            active_tid: 1,
            next_tid: 2,
            last_error: 0,
            errno_cell: None,
            tick: 0,
            debug_log: Vec::new(),
            message_box_log: Vec::new(),
            instruction_budget: None,
            instructions_executed: 0,
            context: crate::context::Context::default(),
            trace_stubs: false,
            stub_trace: Vec::new(),
            stub_calls: Vec::new(),
            scheduler: crate::sched::Scheduler::new(),
            yield_requested: None,
        }
    }
}

/// `HostState` dereferences to the *active* process so per-process
/// fields can be read directly off the host state.
impl std::ops::Deref for HostState {
    type Target = ProcessState;

    /// Resolve to the process selected by `active_pid`. Panics if
    /// that PID is not present in the process table.
    fn deref(&self) -> &ProcessState {
        self.cur_process()
    }
}

/// Mutable counterpart of the `Deref` impl: writes through
/// `HostState` land on the *active* process.
impl std::ops::DerefMut for HostState {
    /// Resolve to the process selected by `active_pid`. Panics if
    /// that PID is not present in the process table.
    fn deref_mut(&mut self) -> &mut ProcessState {
        self.cur_process_mut()
    }
}

impl HostState {
    /// Construct a HostState with the heap arena at `[heap_start,
    /// heap_end)` (caller is responsible for mapping that region
    /// in the MMU as R+W).
    ///
    /// The const-arena (used for canned strings handed back from
    /// `GetCommandLineA` / `GetEnvironmentStrings` / etc.) is
    /// **not** allocated here — call [`Self::with_const_arena`]
    /// to set it up if those stubs are exercised. Tests that
    /// don't use them can leave it at zero.
    pub fn new(heap_start: u32, heap_end: u32) -> Self {
        let mut s = HostState::default();
        // The bootstrap process is replaced wholesale; stamp the PID
        // again so the record agrees with its key in the process
        // table, exactly as `Default` does for the process it
        // replaces.
        let mut bootstrap = ProcessState::new(heap_start, heap_end);
        bootstrap.pid = 1;
        s.processes.insert(1, bootstrap);
        s
    }

    /// Borrow the active process. Resolved through
    /// [`Self::active_pid`].
    ///
    /// # Panics
    /// Panics if `active_pid` does not name an entry in the
    /// process table.
    #[must_use]
    pub fn cur_process(&self) -> &ProcessState {
        self.processes
            .get(&self.active_pid)
            .expect("active_pid must always point to a live process")
    }

    /// Mutable borrow of the active process. Pair to
    /// [`Self::cur_process`]; same invariant and same panic.
    pub fn cur_process_mut(&mut self) -> &mut ProcessState {
        self.processes
            .get_mut(&self.active_pid)
            .expect("active_pid must always point to a live process")
    }

    /// Borrow a process by PID, or `None` if no such process is
    /// in the table.
    #[must_use]
    pub fn process(&self, pid: u32) -> Option<&ProcessState> {
        self.processes.get(&pid)
    }

    /// Mutable borrow of a process by PID, or `None` if no such
    /// process is in the table.
    pub fn process_mut(&mut self, pid: u32) -> Option<&mut ProcessState> {
        self.processes.get_mut(&pid)
    }

    /// Borrow the currently-running thread (the one named by
    /// `active_tid`). On a freshly-constructed state that is the
    /// bootstrap thread (`tid = 1`).
    ///
    /// # Panics
    /// Panics if `active_tid` does not name an entry in the
    /// thread table.
    #[must_use]
    pub fn cur_thread(&self) -> &ThreadState {
        self.threads
            .get(&self.active_tid)
            .expect("active_tid must always point to a live thread (Default initialises tid 1)")
    }

    /// Mutable borrow of the currently-running thread. Pair to
    /// [`Self::cur_thread`]; same invariant and same panic.
    pub fn cur_thread_mut(&mut self) -> &mut ThreadState {
        self.threads
            .get_mut(&self.active_tid)
            .expect("active_tid must always point to a live thread (Default initialises tid 1)")
    }

    /// Configure the const-arena (region for canned read-only
    /// strings handed back to the codec). `[start, end)` is a
    /// guest-virtual range the caller has already mapped R+W
    /// (the arena bytes are written via `write_initializer`,
    /// so any page perms suffice as long as the page is mapped).
    ///
    /// Applies to the active process.
    pub fn with_const_arena(mut self, start: u32, end: u32) -> Self {
        let p = self.cur_process_mut();
        p.const_arena_cursor = start;
        p.const_arena_end = end;
        self
    }

    /// Configure the thread-stack pool. `CreateThread` carves
    /// per-thread stacks from the top of this region walking
    /// downward. `[bottom, top)` must already be mapped R+W in
    /// the MMU.
    ///
    /// Applies to the active process.
    pub fn with_thread_stack_pool(mut self, bottom: u32, top: u32) -> Self {
        let p = self.cur_process_mut();
        p.thread_stack_pool_bottom = bottom;
        p.next_thread_stack_top = top;
        self
    }

    /// Configure the per-thread TIB pool. `CreateThread` carves
    /// 4 KiB TIB regions out of `[bottom, top)` walking upward.
    /// Both ends must already be mapped R+W in the MMU. The
    /// bootstrap thread continues to use the runtime's shared
    /// `TEB_BASE`; only spawned threads consume this pool.
    ///
    /// Only `bottom` is recorded; `top` is accepted for symmetry
    /// with the other pool builders and is not stored.
    pub fn with_tib_pool(mut self, bottom: u32, top: u32) -> Self {
        let p = self.cur_process_mut();
        p.tib_pool_bottom = bottom;
        p.next_tib_addr = bottom;
        let _ = top; // explicit upper bound is informational
        self
    }

    /// Configure the child-process pools: image-base cursor
    /// + heap arena. `CreateProcessA` carves a child PE into
    /// `[image_base, image_base + CHILD_IMAGE_STRIDE)` and a
    /// 16 MiB heap out of `[heap_start, heap_end)`.
    pub fn with_child_arena(
        mut self,
        image_base_cursor: u32,
        heap_start: u32,
        heap_end: u32,
    ) -> Self {
        self.next_child_image_base = image_base_cursor;
        self.next_child_heap_base = heap_start;
        self.child_heap_arena_end = heap_end;
        self
    }

    /// Round `n` up to the next multiple of 16, reporting an
    /// `InvalidArgument` attributed to `stub` when the rounding
    /// would overflow `u32`.
    fn align_up_16(stub: &'static str, n: u32) -> Result<u32, Win32Error> {
        n.checked_add(15)
            .map(|v| v & !15u32)
            .ok_or_else(|| Win32Error::InvalidArgument {
                stub,
                reason: format!("size overflow: requested {n} (≈ {n:#x})"),
            })
    }

    /// Bump-allocate `n` bytes in the const arena. Returns the
    /// guest address of the new slab. The caller is responsible
    /// for [`Mmu::write_initializer`]'ing the contents.
    ///
    /// The reservation is rounded up to a multiple of 16 bytes.
    ///
    /// # Errors
    /// Returns [`Win32Error::InvalidArgument`] when the rounded
    /// size overflows `u32`, when `cursor + size` overflows the
    /// address space, or when the arena does not have enough room
    /// left. The cursor is left untouched on error.
    pub fn arena_const_alloc(&mut self, n: u32) -> Result<u32, Win32Error> {
        let aligned = Self::align_up_16("arena_const_alloc", n)?;
        let addr = self.const_arena_cursor;
        let next = addr
            .checked_add(aligned)
            .ok_or_else(|| Win32Error::InvalidArgument {
                stub: "arena_const_alloc",
                reason: "const arena address-space overflow".into(),
            })?;
        if next > self.const_arena_end {
            // Saturate: a cursor above the arena end (e.g. a pool
            // configured with `start > end`) must produce this
            // error, not an arithmetic-underflow panic while
            // formatting it.
            let remaining = self.const_arena_end.saturating_sub(addr);
            return Err(Win32Error::InvalidArgument {
                stub: "arena_const_alloc",
                reason: format!("const arena exhausted (need {n}, have {remaining})"),
            });
        }
        self.const_arena_cursor = next;
        Ok(addr)
    }

    /// Allocate a fresh slab in the heap arena and return its
    /// guest address. Used by the round-2 marshalling helpers to
    /// stage `ICDECOMPRESS` / `BITMAPINFOHEADER` / raw-frame
    /// buffers in guest memory before calling `DriverProc`.
    ///
    /// The reservation is rounded up to a multiple of 16 bytes and
    /// an `n`-byte zero-filled buffer is recorded in the heap map
    /// under the returned address.
    ///
    /// # Errors
    /// Returns [`Win32Error::InvalidArgument`] when the rounded
    /// size overflows `u32`, when `cursor + size` overflows the
    /// address space, or when the arena does not have enough room
    /// left. The cursor and heap map are left untouched on error.
    pub fn arena_alloc(&mut self, n: u32) -> Result<u32, Win32Error> {
        // NOTE(review): a zero-byte request rounds to a zero-byte
        // reservation, so the cursor does not advance and the next
        // allocation reuses this address — confirm that is intended.
        let aligned = Self::align_up_16("arena_alloc", n)?;
        let addr = self.heap_cursor;
        let next = addr
            .checked_add(aligned)
            .ok_or_else(|| Win32Error::InvalidArgument {
                stub: "arena_alloc",
                reason: "heap address-space overflow".into(),
            })?;
        if next > self.heap_arena_end {
            // Saturate for the same reason as in the const arena: a
            // cursor past the end must not underflow here.
            let remaining = self.heap_arena_end.saturating_sub(addr);
            return Err(Win32Error::InvalidArgument {
                stub: "arena_alloc",
                reason: format!("arena exhausted (need {n}, have {remaining})"),
            });
        }
        self.heap_cursor = next;
        self.heap.insert(addr, vec![0u8; n as usize]);
        Ok(addr)
    }
}

/// Stub registry. Created once per emulator instance.
///
/// Maps imported `(dll, name)` pairs to guest addresses — either a
/// synthetic thunk address backed by a host stub, or the address of
/// a data-import slot.
///
/// NOTE(review): the derived `Default` zero-initialises
/// `next_data_slot`, whereas [`Registry::new`] starts it at
/// [`DATA_IMPORT_BASE`] — prefer `new()` when data imports will be
/// registered.
#[derive(Default)]
pub struct Registry {
    /// Stub entries keyed by their synthetic thunk address.
    by_thunk: BTreeMap<u32, StubEntry>,
    /// Resolved address per `(lower-cased dll, name)` key. Holds
    /// thunk addresses for stubs and slot addresses for data
    /// imports.
    by_name: BTreeMap<(String, String), u32>,
    /// Index of the next thunk slot to hand out; the thunk address
    /// is derived from it as `THUNK_BASE + next_slot * THUNK_STRIDE`.
    next_slot: u32,
    /// Per-(dll, name) **data imports**. Some CRT symbols are
    /// imported by name but are read as data (e.g.
    /// `msvcrt!_adjust_fdiv`, an `int` flag the FDIV-erratum
    /// fix-up code consults). The PE loader treats their IAT
    /// slots as `mov ecx, [iat]; mov edx, [ecx]` — the IAT
    /// slot is the address OF a 4-byte int, not a function
    /// pointer. We pre-allocate a small read/write region for
    /// these and patch the IAT slot to its address. The
    /// `(value)` is whatever the symbol is documented to hold;
    /// 0 is the safe default.
    data_imports: BTreeMap<(String, String), DataImport>,
    /// Bump cursor in the data-import slot region (assigned
    /// addresses live in `[DATA_IMPORT_BASE, DATA_IMPORT_BASE +
    /// DATA_IMPORT_SIZE)`).
    next_data_slot: u32,
}

/// One data-import slot, addressed via [`Registry::resolve`].
///
/// Created by [`Registry::register_data`]; each slot is 4 bytes
/// wide and lives in the region starting at [`DATA_IMPORT_BASE`].
#[derive(Clone, Copy, Debug)]
pub struct DataImport {
    /// Guest address of the 4-byte slot. The PE loader patches
    /// the IAT entry with this value.
    pub addr: u32,
    /// Initial value to seed into `[addr]` at first slot
    /// allocation. Subsequent registrations of the same name
    /// keep the prior value.
    pub initial: u32,
}

/// Region reserved for data-import slots — see [`DataImport`].
/// 4 KiB is plenty: the entire CRT data-import set is fewer
/// than 16 dwords across all codecs we expect to load.
pub const DATA_IMPORT_BASE: u32 = 0x7010_0000;
/// Size of the data-import region in bytes (4 KiB).
const DATA_IMPORT_SIZE: u32 = 0x0000_1000;
/// Exclusive upper bound of the data-import region.
const DATA_IMPORT_END: u32 = DATA_IMPORT_BASE + DATA_IMPORT_SIZE;

impl Registry {
    pub fn new() -> Self {
        Registry {
            by_thunk: BTreeMap::new(),
            by_name: BTreeMap::new(),
            next_slot: 0,
            data_imports: BTreeMap::new(),
            next_data_slot: DATA_IMPORT_BASE,
        }
    }

    /// Register a data import — a 4-byte symbol the codec
    /// reads via `mov reg, [iat]; mov reg, [reg]`. Returns
    /// the guest address that the IAT slot should point at.
    /// Subsequent calls with the same `(dll, name)` return the
    /// previously assigned slot.
    pub fn register_data(&mut self, dll: &str, name: &str, initial: u32) -> u32 {
        let key = (dll.to_ascii_lowercase(), name.to_string());
        if let Some(d) = self.data_imports.get(&key) {
            return d.addr;
        }
        let addr = self.next_data_slot;
        let next = addr.saturating_add(4);
        if next > DATA_IMPORT_END {
            // Caller asked to register more data imports than
            // we reserved space for. Return 0 — the loader
            // handles "unresolved" by falling back to a thunk
            // that will trap loudly.
            return 0;
        }
        self.next_data_slot = next;
        self.data_imports.insert(key, DataImport { addr, initial });
        // Also expose it through the by-name resolver so the
        // PE loader's ordinary lookup picks it up. The
        // returned address is in the data region (not a thunk
        // — `is_thunk(addr)` will correctly return false).
        self.by_name
            .insert((dll.to_ascii_lowercase(), name.to_string()), addr);
        addr
    }

    /// Iterate the registered data imports. The PE loader uses
    /// this to seed each slot's `initial` value into MMU memory
    /// after the data-import region has been mapped.
    pub fn data_imports(&self) -> impl Iterator<Item = (&String, &String, &DataImport)> {
        self.data_imports
            .iter()
            .map(|((dll, name), d)| (dll, name, d))
    }

    /// Register a stub. Returns the synthetic thunk address that
    /// the IAT slot should be populated with.
    pub fn register(&mut self, dll: &str, name: &str, func: StubFn, arg_dwords: u32) -> u32 {
        let key = (dll.to_ascii_lowercase(), name.to_string());
        if let Some(addr) = self.by_name.get(&key) {
            return *addr;
        }
        let thunk_addr = THUNK_BASE.wrapping_add(self.next_slot.wrapping_mul(THUNK_STRIDE));
        self.next_slot += 1;
        self.by_name.insert(key.clone(), thunk_addr);
        self.by_thunk.insert(
            thunk_addr,
            StubEntry {
                dll: key.0,
                name: key.1,
                func,
                arg_dwords,
                thunk_addr,
            },
        );
        thunk_addr
    }

    /// Resolve an import. The PE loader uses this when populating
    /// IAT slots. `dll_name` is matched case-insensitively.
    pub fn resolve(&self, dll: &str, name: &str) -> Option<u32> {
        let key = (dll.to_ascii_lowercase(), name.to_string());
        self.by_name.get(&key).copied()
    }

    /// Register a fail-soft fallback thunk for an import we
    /// don't have a stub for. The thunk's stub function looks
    /// itself up in the registry and raises
    /// [`crate::emulator::Trap::UnresolvedImport`] carrying
    /// the (dll, name) pair on first call.
    ///
    /// The PE loader's fail-soft mode installs one of these
    /// for every unresolved IAT entry so loading succeeds and
    /// execution proceeds until the first unknown API actually
    /// gets called. That's a much better signal than failing
    /// at load time: the trap names the specific function to
    /// implement next, and reveals which import paths are
    /// reachable from the entry point.
    pub fn register_unknown_fallback(&mut self, dll: &str, name: &str) -> u32 {
        let key = (dll.to_ascii_lowercase(), name.to_string());
        if let Some(addr) = self.by_name.get(&key) {
            return *addr;
        }
        let thunk_addr = THUNK_BASE.wrapping_add(self.next_slot.wrapping_mul(THUNK_STRIDE));
        self.next_slot += 1;
        self.by_name.insert(key.clone(), thunk_addr);
        // arg_dwords=0 is wrong for most stdcall APIs but
        // doesn't matter — the stub traps before returning so
        // dispatch_stub never reaches the stack-cleanup path.
        self.by_thunk.insert(
            thunk_addr,
            StubEntry {
                dll: key.0,
                name: key.1,
                func: stub_unresolved_fallback,
                arg_dwords: 0,
                thunk_addr,
            },
        );
        thunk_addr
    }

    /// True iff `addr` is a registered thunk address.
    pub fn is_thunk(&self, addr: u32) -> bool {
        self.by_thunk.contains_key(&addr)
    }

    /// Look up the stub entry by its thunk address. Used by the
    /// runtime when it sees `eip == thunk_addr`.
    pub fn entry(&self, addr: u32) -> Option<&StubEntry> {
        self.by_thunk.get(&addr)
    }

    /// Convenience: register every kernel32 stub. Returns the
    /// number of stubs registered.
    pub fn register_kernel32(&mut self) -> usize {
        let before = self.by_name.len();
        kernel32::register(self);
        self.by_name.len() - before
    }

    /// Register every gdi32 stub. Returns the number registered.
    pub fn register_gdi32(&mut self) -> usize {
        let before = self.by_name.len();
        gdi32::register(self);
        self.by_name.len() - before
    }

    /// Register every user32 stub. Returns the number registered.
    pub fn register_user32(&mut self) -> usize {
        let before = self.by_name.len();
        user32::register(self);
        self.by_name.len() - before
    }

    /// Register every winmm stub. Returns the number registered.
    pub fn register_winmm(&mut self) -> usize {
        let before = self.by_name.len();
        winmm::register(self);
        self.by_name.len() - before
    }

    /// Register every advapi32 stub. Returns the number registered.
    pub fn register_advapi32(&mut self) -> usize {
        let before = self.by_name.len();
        advapi32::register(self);
        self.by_name.len() - before
    }

    /// Register every ole32 stub. Returns the number registered.
    pub fn register_ole32(&mut self) -> usize {
        let before = self.by_name.len();
        ole32::register(self);
        self.by_name.len() - before
    }

    /// Register every msvcrt stub. Returns the number registered.
    pub fn register_msvcrt(&mut self) -> usize {
        let before = self.by_name.len();
        msvcrt::register(self);
        self.by_name.len() - before
    }

    /// Register the msvcrt stub set under `msvcr71.dll`. Used by
    /// codecs from the wmfdist11 era (mp43decd, mp4sdecd,
    /// wmvdecod, …) that link MSVC 7.1's runtime by its
    /// per-version name. Returns the number registered.
    pub fn register_msvcr71(&mut self) -> usize {
        let before = self.by_name.len();
        msvcrt::register_alias(self, "msvcr71.dll");
        self.by_name.len() - before
    }

    /// Register the msvcrt stub set under `pncrt.dll`. Used by
    /// RealNetworks codecs that ship their own CRT fork.
    /// Returns the number registered.
    pub fn register_pncrt(&mut self) -> usize {
        let before = self.by_name.len();
        msvcrt::register_alias(self, "pncrt.dll");
        self.by_name.len() - before
    }

    /// Register the msvcrt stub set under `msvcr80.dll` (Visual
    /// Studio 2005 CRT). Used by `camstudio-1.4-camcodec.dll`.
    pub fn register_msvcr80(&mut self) -> usize {
        let before = self.by_name.len();
        msvcrt::register_alias(self, "msvcr80.dll");
        self.by_name.len() - before
    }

    /// Register the msvcrt stub set under `msvcr90.dll` (Visual
    /// Studio 2008 CRT). Used by `camstudio-1.5-camcodec.dll`.
    pub fn register_msvcr90(&mut self) -> usize {
        let before = self.by_name.len();
        msvcrt::register_alias(self, "msvcr90.dll");
        self.by_name.len() - before
    }

    /// Register every mfplat (Media Foundation platform) stub.
    /// Returns the number registered.
    pub fn register_mfplat(&mut self) -> usize {
        let before = self.by_name.len();
        mfplat::register(self);
        self.by_name.len() - before
    }

    /// Register every msi.dll stub — Windows Installer surface
    /// touched by application installers (QuickTime, …).
    /// Returns the number registered.
    pub fn register_msi(&mut self) -> usize {
        let before = self.by_name.len();
        msi::register(self);
        self.by_name.len() - before
    }

    /// Register the version.dll / comctl32.dll / shell32.dll /
    /// shlwapi.dll stub families — the config-dialog and
    /// settings-file surface VfW codecs pull in alongside their
    /// decode core. Returns the number registered.
    pub fn register_shell_support(&mut self) -> usize {
        let before = self.by_name.len();
        version::register(self);
        comctl32::register(self);
        shell32::register(self);
        shlwapi::register(self);
        self.by_name.len() - before
    }

    /// Register every Round-1+4+8+20 stub family in one call:
    /// kernel32, gdi32, user32, winmm, advapi32, ole32, msvcrt,
    /// plus the round-27 host-COM thunk family used by
    /// [`crate::com::mint_host_filter_graph`].  Returns the total
    /// number registered.
    pub fn register_all(&mut self) -> usize {
        let host_before = self.by_name.len();
        crate::com::host_iface::register(self);
        crate::com::host_iface_r31::register(self);
        let host_count = self.by_name.len() - host_before;
        self.register_kernel32()
            + self.register_gdi32()
            + self.register_user32()
            + self.register_winmm()
            + self.register_advapi32()
            + self.register_ole32()
            + self.register_msvcrt()
            + self.register_msvcr71()
            + self.register_pncrt()
            + self.register_msvcr80()
            + self.register_msvcr90()
            + self.register_mfplat()
            + self.register_msi()
            + self.register_shell_support()
            + host_count
    }
}

/// Read the `n`-th stdcall dword argument off the guest stack.
///
/// At entry, `esp` points to the saved return address (pushed by
/// the caller's CALL); the first argument is at `esp+4`, the
/// second at `esp+8`, etc. `n` is zero-based.
///
/// The offset is computed with wrapping arithmetic, matching the
/// wrapping add onto `esp`, so an out-of-range `n` yields a
/// (probably unmapped) guest address rather than a host-side
/// overflow panic.
///
/// # Errors
/// Propagates the trap raised by the MMU when the computed
/// address cannot be read.
pub fn arg_dword(cpu: &Cpu, mmu: &Mmu, n: u32) -> Result<u32, crate::emulator::Trap> {
    let offset = n.wrapping_add(1).wrapping_mul(4);
    let addr = cpu.regs.esp().wrapping_add(offset);
    mmu.load32(addr)
}

/// Cdecl arg-count override table for trace-event extraction.
///
/// Stdcall stubs already declare their argument count in
/// [`StubEntry::arg_dwords`] (the value the dispatch site uses
/// to pop the stack on return). Cdecl stubs declare `0` because
/// the *caller* cleans the stack — but the args are still on the
/// stack at call entry. For known-shape cdecl entries we return
/// the per-call dword count so the trace probe can read those
/// dwords back into `args[]` on `kind=win32_call` events.
///
/// The table applies to `msvcrt.dll` and to every DLL name the
/// msvcrt stub set is also registered under (`msvcr71.dll`,
/// `pncrt.dll`, `msvcr80.dll`, `msvcr90.dll`), so heap calls made
/// through a per-version CRT name are traced the same way. The
/// DLL name is compared ASCII-case-insensitively; the symbol name
/// must match exactly.
///
/// Returns `None` if the `(dll, name)` pair has no override; in
/// that case the trace site falls back to the registered
/// `arg_dwords` (0 for any cdecl stub, leaving `args:[]` as
/// before — so this is purely additive).
///
/// Reference: `docs/video/msmpeg4/audit/06-sandbox-O3-quant-init.md`
/// §5.2.3 — Auditor needs allocation sizes surfaced at call
/// time so the codec-context allocation can be located by size
/// match rather than by return-address differencing.
pub fn cdecl_trace_arg_count(dll: &str, name: &str) -> Option<u32> {
    // Every DLL name the msvcrt stub set is registered under.
    const CRT_DLLS: [&str; 5] = [
        "msvcrt.dll",
        "msvcr71.dll",
        "pncrt.dll",
        "msvcr80.dll",
        "msvcr90.dll",
    ];
    if !CRT_DLLS.iter().any(|crt| dll.eq_ignore_ascii_case(crt)) {
        return None;
    }
    match name {
        // Heap surface — single-arg shapes.
        //   void* malloc(size_t)                              — 1
        //   void  free(void*)                                 — 1
        //   void* operator new(unsigned int)  ??2@YAPAXI@Z    — 1
        //   void  operator delete(void*)      ??3@YAXPAX@Z    — 1
        "malloc" | "free" | "??2@YAPAXI@Z" | "??3@YAXPAX@Z" => Some(1),
        // Two-arg shapes — pre-declared so a future
        // `register(<crt>, "calloc"/"realloc", ...)` automatically
        // gets traced args without revisiting this table.
        //   void* calloc(size_t count, size_t size)           — 2
        //   void* realloc(void*, size_t)                      — 2
        "calloc" | "realloc" => Some(2),
        _ => None,
    }
}

/// Wrap an MMU/CPU [`crate::emulator::Trap`] in a
/// [`Win32Error::InvalidArgument`] attributed to `stub`, so that a
/// stub's failed argument fetch is reported as a bad argument.
/// The trap's display text becomes the error's `reason`. Used by
/// the gdi32 / user32 / winmm modules.
pub fn trap_to_win32_local(stub: &'static str, t: crate::emulator::Trap) -> Win32Error {
    let reason = t.to_string();
    Win32Error::InvalidArgument { stub, reason }
}

/// Read an 8-bit string out of guest memory starting at `addr`.
///
/// Reading stops at the first NUL byte (which is not included) or
/// once `max` bytes have been collected, whichever comes first.
/// Bytes that are not valid UTF-8 are replaced lossily. Used by
/// user32/winmm stubs that take an `LPCSTR`.
///
/// # Errors
/// A failed guest-memory read is reported as
/// `Win32Error::InvalidArgument` attributed to `"read_cstr"`.
pub fn read_cstr_local(mmu: &Mmu, mut addr: u32, max: u32) -> Result<String, Win32Error> {
    let mut raw: Vec<u8> = Vec::new();
    let mut remaining = max;
    while remaining > 0 {
        let loaded = mmu
            .load8(addr)
            .map_err(|t| trap_to_win32_local("read_cstr", t))?;
        match loaded {
            0 => break,
            byte => raw.push(byte),
        }
        addr = addr.wrapping_add(1);
        remaining -= 1;
    }
    Ok(String::from_utf8_lossy(&raw).into_owned())
}

/// Stub body installed by [`Registry::register_unknown_fallback`].
///
/// It never succeeds: it reverse-resolves the current EIP (the
/// thunk address) to the `(dll, name)` it was registered under and
/// returns [`Win32Error::UnknownImport`] carrying that pair, which
/// the runtime surfaces as `Trap::UnresolvedImport`. If the EIP is
/// not a registered thunk the error names `<unknown>` and the raw
/// address instead. Execution halts on first call — the operator
/// sees the precise import to implement next.
fn stub_unresolved_fallback(
    cpu: &mut Cpu,
    _mmu: &mut Mmu,
    _state: &mut HostState,
    registry: &Registry,
) -> Result<u32, Win32Error> {
    let thunk = cpu.regs.eip;
    let (dll, name) = match registry.entry(thunk) {
        Some(found) => (found.dll.clone(), found.name.clone()),
        None => ("<unknown>".to_string(), format!("@{thunk:#010x}")),
    };
    Err(Win32Error::UnknownImport { dll, name })
}

/// Dispatch a stub call. The runtime wires this into the executor
/// so that whenever `eip` lands on a thunk address, control
/// transfers here instead of fetching instruction bytes.
///
/// On entry: the guest CALL has already pushed the return
/// address; `eip` is the thunk address. On exit: `eax` holds the
/// stub's return value, `eip` is the popped return address, and
/// `arg_dwords*4` bytes have been removed from the stack
/// (stdcall callee-cleanup).
///
/// When `state.trace_stubs` is set, one formatted line is appended
/// to `state.stub_trace` and one structured record to
/// `state.stub_calls` per successful call.
///
/// # Errors
/// - `Win32Error::UnknownImport` (converted into `crate::Error`)
///   when no stub is registered at the current `eip`.
/// - Whatever error the stub itself returns; in that case the
///   guest stack, `eax` and `eip` are left as the stub left them
///   and nothing is traced.
/// - The error from popping the return address off the guest
///   stack, if that read fails.
pub fn dispatch_stub(
    cpu: &mut Cpu,
    mmu: &mut Mmu,
    registry: &Registry,
    state: &mut HostState,
) -> Result<(), crate::Error> {
    let addr = cpu.regs.eip;
    // Clone the entry so the stub can be invoked without holding a
    // borrow into the registry's map.
    let entry = registry
        .entry(addr)
        .ok_or_else(|| Win32Error::UnknownImport {
            dll: "<thunk>".into(),
            name: format!("@{:#010x}", addr),
        })?
        .clone();
    // Snapshot the call-site EIP (= the saved return address
    // pushed by the guest CALL — the instruction right after
    // the CALL, not the thunk address) and the first few args
    // off the guest stack BEFORE running the stub, since the
    // stub mutates the stack.
    //
    // Argument count: `entry.arg_dwords` carries the stdcall
    // count (the value used to pop the stack on return). For
    // cdecl stubs this is 0 — but for known cdecl shapes
    // (msvcrt heap entries) [`cdecl_trace_arg_count`] supplies a
    // per-call override so the trace surfaces the size / pointer
    // args rather than `args:[]`.
    //
    // The snapshot is always-on when `state.trace_stubs` is set
    // (the structured `stub_calls` vector consumes it) and is
    // additionally emitted as a JSONL event under the `trace`
    // feature flag.
    let capture_args = state.trace_stubs;
    #[cfg(feature = "trace")]
    let capture_args = capture_args || mmu.trace.has_sink();
    let snapshot: Option<(u32, Vec<u32>)> = if capture_args {
        // Unreadable stack slots are recorded as 0 rather than
        // failing the call: tracing is best-effort.
        let call_site_eip = mmu.load32(cpu.regs.esp()).unwrap_or(0);
        let n_args = cdecl_trace_arg_count(&entry.dll, &entry.name).unwrap_or(entry.arg_dwords);
        let mut args = Vec::with_capacity(n_args as usize);
        for i in 0..n_args {
            let a = arg_dword(cpu, mmu, i).unwrap_or(0);
            args.push(a);
        }
        Some((call_site_eip, args))
    } else {
        None
    };
    // Run the host-side stub.
    let ret = (entry.func)(cpu, mmu, state, registry)?;
    if state.trace_stubs {
        // The snapshot is cloned because the `trace` feature build
        // consumes it again below. It is `None` here only if the
        // stub switched `trace_stubs` on during this very call.
        let (call_site_eip, args) = snapshot.clone().unwrap_or((0, Vec::new()));
        let args_str = args
            .iter()
            .map(|a| format!("{a:#010x}"))
            .collect::<Vec<_>>()
            .join(", ");
        state.stub_trace.push(format!(
            "{}!{}({args_str}) → {:#010x}",
            entry.dll, entry.name, ret
        ));
        state.stub_calls.push(StubCall {
            dll: entry.dll.clone(),
            name: entry.name.clone(),
            args,
            ret,
            call_site_eip,
        });
    }
    // Emit the trace event with the captured args + the actual
    // return value. Done before stack unwind so the EIP we log
    // is the call site, not the post-return PC.
    #[cfg(feature = "trace")]
    if let Some((call_site_eip, args)) = snapshot {
        mmu.trace
            .ev_win32_call(&entry.dll, &entry.name, &args, ret, call_site_eip);
    }
    // stdcall: pop return address, advance esp by arg_dwords*4,
    // set eax to the return value.
    let ret_addr = cpu.pop32(mmu)?;
    cpu.regs.set32(crate::emulator::regs::Reg32::Eax, ret);
    let new_esp = cpu
        .regs
        .esp()
        .wrapping_add(entry.arg_dwords.wrapping_mul(4));
    cpu.regs.set_esp(new_esp);
    cpu.regs.eip = ret_addr;
    Ok(())
}

// NOTE(review): a doc paragraph describing the shared run loop
// ("run until `eip == RET_SENTINEL`, dispatching Win32 stub thunks;
// used by `crate::Sandbox` and by re-entrant host stubs such as the
// `vfw32` surface") used to be attached here. It does not describe
// this function and presumably belongs on the run-loop entry point.

/// Process a yield request from a freshly-returned stub.
///
/// The active thread first transitions to the state the request
/// asks for:
/// - `Wait(cond)` — status becomes `Waiting` and `cond` is stored
///   as the thread's wait condition;
/// - `Yield` — status becomes `Ready`;
/// - `Exit { code }` — status becomes `Terminated`, any wait
///   condition is cleared, and [`on_thread_terminated`] is run for
///   the thread. The exit `code` itself is not recorded here.
///
/// Afterwards [`schedule_next_thread`] picks the next `Ready`
/// thread: the live `Cpu` is parked into the previous active
/// thread's `parked_cpu` slot and the new thread's parked `Cpu`
/// replaces it.
fn handle_yield(cpu: &mut Cpu, state: &mut HostState, req: crate::sched::YieldRequest) {
    use crate::sched::{ThreadStatus, YieldRequest};
    match req {
        YieldRequest::Wait(cond) => {
            let thread = state.cur_thread_mut();
            thread.status = ThreadStatus::Waiting;
            thread.wait = Some(cond);
        }
        YieldRequest::Yield => state.cur_thread_mut().status = ThreadStatus::Ready,
        // The exit code is intentionally unused at this point.
        YieldRequest::Exit { code: _ } => {
            let exiting_tid = state.active_tid;
            {
                let thread = state.cur_thread_mut();
                thread.status = ThreadStatus::Terminated;
                thread.wait = None;
            }
            // Process-exit bookkeeping and waking of handle
            // waiters happen in here.
            on_thread_terminated(state, exiting_tid);
        }
    }
    schedule_next_thread(cpu, state);
}

/// Switch the live `Cpu` over to the next `Ready` thread, if any.
///
/// The candidate is the `Ready` thread — other than the current
/// one — with the highest priority, ties broken by the lowest TID.
///
/// When such a thread exists, the live `Cpu` is parked into the
/// current thread, the candidate's parked `Cpu` (if it has one)
/// becomes the live one, the candidate is marked `Running`, and
/// `active_tid` — plus `active_pid`, when the candidate's process
/// is in the process table — is updated.
///
/// When no other `Ready` thread exists, the `Cpu` is left untouched
/// and the run loop continues with the same thread (which is fine
/// for single-thread Sleep behaviour: the clock fast-forward at the
/// top of the loop wakes the same thread). In that case the current
/// thread is marked `Running` only if it was `Ready` or `Running`.
fn schedule_next_thread(cpu: &mut Cpu, state: &mut HostState) {
    use crate::sched::ThreadStatus;
    let cur_tid = state.active_tid;
    // Smallest element under "descending priority, then ascending
    // TID" is the thread to run next. TIDs are unique, so the
    // ordering is total and the pick is deterministic.
    let picked = state
        .threads
        .iter()
        .filter(|(tid, t)| **tid != cur_tid && matches!(t.status, ThreadStatus::Ready))
        .map(|(tid, t)| (t.priority, *tid))
        .min_by(|a, b| b.0.cmp(&a.0).then(a.1.cmp(&b.1)))
        .map(|(_, tid)| tid);
    let Some(next_tid) = picked else {
        // No other Ready thread. If the current is still
        // runnable, just keep going. Otherwise the run loop's
        // sleep-clock fast-forward is expected to wake it.
        let still_runnable = matches!(
            state.threads.get(&cur_tid).map(|t| t.status),
            Some(ThreadStatus::Ready | ThreadStatus::Running)
        );
        if still_runnable {
            state.cur_thread_mut().status = ThreadStatus::Running;
        }
        return;
    };
    // Park the live CPU into the outgoing thread; this leaves a
    // default `Cpu` in `*cpu` until the incoming one is restored.
    let outgoing_cpu = std::mem::take(cpu);
    if let Some(outgoing) = state.threads.get_mut(&cur_tid) {
        outgoing.parked_cpu = Some(outgoing_cpu);
    }
    // Restore the incoming thread's parked CPU into the live one
    // and remember which process it belongs to.
    let incoming_pid = state.threads.get_mut(&next_tid).map(|incoming| {
        if let Some(saved) = incoming.parked_cpu.take() {
            *cpu = saved;
        }
        incoming.status = ThreadStatus::Running;
        incoming.pid
    });
    state.active_tid = next_tid;
    // When the new thread lives in a different process, update
    // `active_pid` so the Deref-resolved per-process state
    // (heap arena, modules, hwnd registry, …) points at the
    // new process. Unknown PIDs leave `active_pid` unchanged.
    if let Some(pid) = incoming_pid.filter(|pid| state.processes.contains_key(pid)) {
        state.active_pid = pid;
    }
}

/// Post-termination bookkeeping for thread `tid`.
///
/// Two things happen, in this order:
///
/// 1. Every thread blocked on a `WaitObject::Thread` handle that
///    targets `tid` is moved back to `Ready`. This is done
///    unconditionally: a thread handle becomes signalled as soon
///    as *that thread* is gone, whether or not its process still
///    has other live threads (the waiter itself is frequently one
///    of them).
/// 2. If the owning process has no non-`Terminated` thread left,
///    its exit code is recorded (defaulting to 0 if not already
///    set) and every thread blocked on a `WaitObject::Process`
///    handle targeting that PID is woken. Phase 5c — chains the
///    natural Win32 "last thread out marks the process exited"
///    contract.
///
/// Does nothing when `tid` is not present in `state.threads`.
fn on_thread_terminated(state: &mut HostState, tid: u32) {
    /// Move every thread currently waiting on one of `handles`
    /// back to `Ready` and clear its wait condition.
    fn wake_waiters(state: &mut HostState, handles: Vec<u32>) {
        for h in handles {
            for waiter_tid in crate::sched::waiters_on(&state.threads, h) {
                if let Some(t) = state.threads.get_mut(&waiter_tid) {
                    t.status = crate::sched::ThreadStatus::Ready;
                    t.wait = None;
                }
            }
        }
    }

    let Some(pid) = state.threads.get(&tid).map(|t| t.pid) else {
        return;
    };

    // Thread-handle waiters first. This must not sit behind the
    // "process still alive" early return below, otherwise a
    // same-process waiter (which keeps the process alive) would
    // never be woken by the thread it is waiting for.
    let thread_handles: Vec<u32> = state
        .scheduler
        .objects
        .iter()
        .filter_map(|(h, obj)| match obj {
            crate::sched::WaitObject::Thread { tid: t } if *t == tid => Some(*h),
            _ => None,
        })
        .collect();
    wake_waiters(state, thread_handles);

    // Process-level bookkeeping only applies once the last thread
    // of the process is gone.
    let alive = state
        .threads
        .values()
        .any(|t| t.pid == pid && !matches!(t.status, crate::sched::ThreadStatus::Terminated));
    if alive {
        return;
    }
    if let Some(p) = state.processes.get_mut(&pid) {
        if p.exit_code.is_none() {
            p.exit_code = Some(0);
        }
    }
    // Wake every Process-handle waiter on this PID.
    let process_handles: Vec<u32> = state
        .scheduler
        .objects
        .iter()
        .filter_map(|(h, obj)| match obj {
            crate::sched::WaitObject::Process { pid: p } if *p == pid => Some(*h),
            _ => None,
        })
        .collect();
    wake_waiters(state, process_handles);
}

/// Smallest `resume_after_instructions` deadline among all
/// threads that are `Waiting` on a `Sleep` condition.
///
/// Returns `None` when no thread is currently sleeping.
fn earliest_sleep_resume(state: &HostState) -> Option<u64> {
    state
        .threads
        .values()
        // Only threads that are actually parked count; a stale
        // `wait` on a non-waiting thread is ignored.
        .filter(|thread| matches!(thread.status, crate::sched::ThreadStatus::Waiting))
        .filter_map(|thread| match &thread.wait {
            Some(crate::sched::WaitCondition::Sleep {
                resume_after_instructions,
            }) => Some(*resume_after_instructions),
            _ => None,
        })
        .min()
}

/// Promote sleeping threads whose deadline has been reached.
///
/// Every thread that is `Waiting` on a `Sleep` condition with
/// `resume_after_instructions` at or before the scheduler's
/// global instruction clock becomes `Ready` and has its wait
/// condition cleared. Called from [`run_until_sentinel`] after
/// the global clock advances.
fn wake_sleep_if_due(state: &mut HostState) {
    let clock = state.scheduler.instructions_global;
    let parked = state
        .threads
        .values_mut()
        .filter(|thread| matches!(thread.status, crate::sched::ThreadStatus::Waiting));
    for thread in parked {
        // Threads waiting on anything other than Sleep are left alone.
        let deadline = match &thread.wait {
            Some(crate::sched::WaitCondition::Sleep {
                resume_after_instructions,
            }) => *resume_after_instructions,
            _ => continue,
        };
        if deadline <= clock {
            thread.status = crate::sched::ThreadStatus::Ready;
            thread.wait = None;
        }
    }
}

/// Drive the emulator until the current top-level call finishes.
///
/// Resets `state.instructions_executed` to zero and then loops.
/// Each iteration, in order:
///
/// 1. services a pending `state.yield_requested` via `handle_yield`;
/// 2. if the active thread is not `Ready`/`Running`, fast-forwards
///    the global instruction clock to the earliest sleep deadline,
///    wakes due sleepers and reschedules — returning `Ok(())` with
///    `eip` forced to `RET_SENTINEL` if the active thread is still
///    not runnable afterwards;
/// 3. returns `Ok(())` (again forcing `eip` to `RET_SENTINEL`) when
///    `state.exit_requested` is set;
/// 4. when `eip` is already `RET_SENTINEL`: returns for TID 1,
///    otherwise marks the thread `Terminated` and reschedules;
/// 5. charges one unit against the optional instruction budget and
///    the per-run / global instruction counters;
/// 6. charges the active thread's quantum and requests a `Yield`
///    when it runs out and another `Ready` thread exists;
/// 7. dispatches a host stub if `eip` is a registered thunk,
///    otherwise executes one CPU instruction.
///
/// # Errors
///
/// Returns `Win32Error::BudgetExhausted` (wrapped in
/// `crate::Error::Win32`) when `state.instruction_budget` is
/// `Some(0)` at the point a step would be charged. Errors from
/// `dispatch_stub` are returned as-is, and a failure from
/// `cpu.step` is converted into `crate::Error` and returned. With
/// the `trace` feature enabled, the latter two also emit a trap
/// event before returning.
pub fn run_until_sentinel(
    cpu: &mut Cpu,
    mmu: &mut Mmu,
    registry: &Registry,
    state: &mut HostState,
) -> Result<(), crate::Error> {
    use crate::emulator::isa_int::{StepOk, RET_SENTINEL};
    // Reset the per-run instruction counter so analysis
    // front-ends can ask "how many did this top-level call
    // burn?" without subtracting from a stale snapshot.
    state.instructions_executed = 0;
    loop {
        // Honour any yield request the most recently dispatched
        // stub left behind. Phase 3 of the scheduler refactor:
        // a `Wait`/`Yield`/`Exit` request handed up from a stub
        // suspends the active thread and resumes the next
        // `Ready` one. Until Phase 3d ships, only `Sleep` and
        // `Yield` (single-thread) are observable here — both
        // resolve as "spin until wake-up" without an actual
        // context switch.
        if let Some(req) = state.yield_requested.take() {
            handle_yield(cpu, state, req);
        }
        // Scheduler nudge: when the active thread isn't
        // runnable (Terminated / Waiting because no other
        // Ready thread could be picked at yield time), look
        // for any thread sleeping on a Sleep wait. The
        // earliest wake target fast-forwards the global
        // clock; `wake_sleep_if_due` then moves matching
        // threads back to Ready, and `schedule_next_thread`
        // switches into one of them.
        let active_runnable = matches!(
            state.cur_thread().status,
            crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
        );
        if !active_runnable {
            if let Some(earliest) = earliest_sleep_resume(state) {
                // `max` keeps the clock monotonic: it only ever
                // jumps forward to the deadline, never backwards.
                state.scheduler.instructions_global =
                    state.scheduler.instructions_global.max(earliest);
                wake_sleep_if_due(state);
                schedule_next_thread(cpu, state);
            }
            // Active thread still not runnable AND no Ready
            // peer was found — the run is done. Return so the
            // outer host caller observes a clean exit rather
            // than a busy spin.
            if !matches!(
                state.cur_thread().status,
                crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
            ) {
                cpu.regs.eip = RET_SENTINEL;
                return Ok(());
            }
            state.cur_thread_mut().status = crate::sched::ThreadStatus::Running;
        }
        if state.exit_requested.is_some() {
            // `kernel32!ExitProcess` was called. Force eip to
            // the sentinel so the outer caller's stack-frame
            // cleanup is consistent and exit cleanly.
            cpu.regs.eip = RET_SENTINEL;
            return Ok(());
        }
        if cpu.regs.eip == RET_SENTINEL {
            // The active thread has run off the end of its
            // top-level callable. If it's the bootstrap thread
            // (TID 1), the entire run is done. Otherwise, mark
            // the thread Terminated and switch to the next
            // Ready one.
            if state.active_tid == 1 {
                return Ok(());
            }
            let dead_tid = state.active_tid;
            state.cur_thread_mut().status = crate::sched::ThreadStatus::Terminated;
            on_thread_terminated(state, dead_tid);
            schedule_next_thread(cpu, state);
            // If the scheduler switched to the bootstrap thread
            // and its restored CPU is already sitting on the
            // sentinel, the run is over. In every other case
            // loop again so the top-of-loop checks deal with
            // whichever thread is now active.
            if state.active_tid == 1
                && matches!(
                    state.cur_thread().status,
                    crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
                )
                && cpu.regs.eip == RET_SENTINEL
            {
                return Ok(());
            }
            continue;
        }
        // Optional instruction budget — both instruction steps
        // and stub dispatches are counted as one "step" each,
        // since either is a unit of progress the host attributed
        // to the guest. When the budget hits zero, bail with a
        // clean `BudgetExhausted` so adversarial samples can't
        // loop the analyser host.
        if let Some(remaining) = state.instruction_budget.as_mut() {
            if *remaining == 0 {
                return Err(crate::Error::Win32(Win32Error::BudgetExhausted {
                    executed: state.instructions_executed,
                }));
            }
            *remaining -= 1;
        }
        state.instructions_executed = state.instructions_executed.saturating_add(1);
        state.scheduler.instructions_global = state.scheduler.instructions_global.saturating_add(1);
        // Quantum-based preemption (Phase 4). Each executed
        // instruction or stub dispatch counts against the
        // current thread's quantum. When it hits zero, ask the
        // scheduler to switch — but only when there is another
        // Ready thread to switch to, otherwise the current
        // thread just keeps the floor with a fresh quantum.
        {
            let quantum_default = state.scheduler.quantum_default;
            let cur_tid = state.active_tid;
            let t = state.cur_thread_mut();
            if t.quantum_remaining > 0 {
                t.quantum_remaining -= 1;
            }
            let exhausted = t.quantum_remaining == 0;
            if exhausted {
                // Refill immediately; the yield (if any) is only
                // requested below and serviced at the top of the
                // next iteration.
                t.quantum_remaining = quantum_default;
            }
            if exhausted {
                let has_peer = state.threads.iter().any(|(tid, ts)| {
                    *tid != cur_tid && matches!(ts.status, crate::sched::ThreadStatus::Ready)
                });
                if has_peer {
                    state.yield_requested = Some(crate::sched::YieldRequest::Yield);
                }
            }
        }
        // A thunk address means "call into a host stub" rather
        // than "decode a guest instruction".
        if registry.is_thunk(cpu.regs.eip) {
            match dispatch_stub(cpu, mmu, registry, state) {
                Ok(()) => continue,
                Err(e) => {
                    #[cfg(feature = "trace")]
                    emit_trap_event(cpu, mmu, &e);
                    return Err(e);
                }
            }
        }
        match cpu.step(mmu) {
            Ok(StepOk::Continued) => continue,
            Ok(StepOk::Halted) => {
                // The active thread executed a `ret` whose
                // popped address was `RET_SENTINEL`. For the
                // bootstrap thread that's the run's exit; for
                // any other thread it means the thread proc
                // returned, so we mark it Terminated and let
                // the scheduler pick the next runnable peer.
                if state.active_tid == 1 {
                    return Ok(());
                }
                cpu.regs.eip = RET_SENTINEL;
                let dead_tid = state.active_tid;
                state.cur_thread_mut().status = crate::sched::ThreadStatus::Terminated;
                on_thread_terminated(state, dead_tid);
                schedule_next_thread(cpu, state);
                continue;
            }
            Err(t) => {
                let e: crate::Error = t.into();
                #[cfg(feature = "trace")]
                emit_trap_event(cpu, mmu, &e);
                return Err(e);
            }
        }
    }
}

/// Trace-feature-gated trap reporter.
///
/// Classifies `err` into a label, an address and an optional
/// opcode, snapshots the eight general-purpose registers, and
/// pushes the lot as one `kind=trap` JSONL event through
/// `mmu.trace.ev_trap`. Error variants that carry no address of
/// their own are attributed to the live `eip`.
#[cfg(feature = "trace")]
fn emit_trap_event(cpu: &Cpu, mmu: &Mmu, err: &crate::Error) {
    use crate::emulator::regs::Reg32;
    let live_eip = cpu.regs.eip;
    let (label, eip, opcode): (&'static str, u32, Option<u32>) = match err {
        crate::Error::Trap(crate::emulator::Trap::MemoryFault { addr }) => {
            ("MemoryFault", *addr, None)
        }
        crate::Error::Trap(crate::emulator::Trap::ReadProtectFault { addr }) => {
            ("ReadProtectFault", *addr, None)
        }
        crate::Error::Trap(crate::emulator::Trap::WriteProtectFault { addr }) => {
            ("WriteProtectFault", *addr, None)
        }
        crate::Error::Trap(crate::emulator::Trap::ExecuteProtectFault { addr }) => {
            ("ExecuteProtectFault", *addr, None)
        }
        crate::Error::Trap(crate::emulator::Trap::UndefinedOpcode { eip, opcode }) => {
            ("UndefinedOpcode", *eip, Some(*opcode))
        }
        crate::Error::Trap(crate::emulator::Trap::PrivilegedOpcode { eip, .. }) => {
            ("PrivilegedOpcode", *eip, None)
        }
        crate::Error::Trap(crate::emulator::Trap::DivideByZero { eip }) => {
            ("DivideByZero", *eip, None)
        }
        crate::Error::Trap(crate::emulator::Trap::UnresolvedImport { .. }) => {
            ("UnresolvedImport", live_eip, None)
        }
        crate::Error::Trap(crate::emulator::Trap::InstructionLimitExceeded { eip, .. }) => {
            ("InstructionLimitExceeded", *eip, None)
        }
        crate::Error::Trap(crate::emulator::Trap::UnimplementedMmx { eip, opcode, .. }) => {
            ("UnimplementedMmx", *eip, Some(*opcode))
        }
        crate::Error::PeLoader(_) => ("PeLoader", live_eip, None),
        crate::Error::Win32(_) => ("Win32", live_eip, None),
        crate::Error::NotImplemented => ("NotImplemented", live_eip, None),
    };
    // Register order is part of the emitted event; keep it fixed.
    let read = |r: Reg32| cpu.regs.get32(r);
    let regs = [
        ("eax", read(Reg32::Eax)),
        ("ecx", read(Reg32::Ecx)),
        ("edx", read(Reg32::Edx)),
        ("ebx", read(Reg32::Ebx)),
        ("esp", cpu.regs.esp()),
        ("ebp", read(Reg32::Ebp)),
        ("esi", read(Reg32::Esi)),
        ("edi", read(Reg32::Edi)),
    ];
    mmu.trace.ev_trap(label, eip, opcode, &regs);
}

/// Invoke a guest function at `target_va` and return its `eax`.
///
/// The call frame is built by pushing `args` right-to-left
/// followed by the synthetic `RET_SENTINEL` return address; `eip`
/// is then pointed at `target_va` and the emulator runs via
/// [`run_until_sentinel`] until that call completes.
///
/// This is the building block both `Sandbox::call_dll_main`
/// and the round-2 `vfw32` stub surface use to invoke an
/// exported guest function with stdcall calling convention.
/// The incoming value of `cpu.regs.eip` is irrelevant (it is
/// overwritten); on exit it is the popped return address
/// (= `RET_SENTINEL`). Caller-saved registers are not preserved
/// beyond what the guest callee preserves itself.
///
/// # Errors
///
/// Propagates any failure from pushing onto the guest stack and
/// any error returned by [`run_until_sentinel`].
pub fn call_guest(
    cpu: &mut Cpu,
    mmu: &mut Mmu,
    registry: &Registry,
    state: &mut HostState,
    target_va: u32,
    args: &[u32],
) -> Result<u32, crate::Error> {
    use crate::emulator::isa_int::RET_SENTINEL;
    use crate::emulator::regs::Reg32;
    // One pass builds the whole frame: last argument first, the
    // sentinel return address last so it ends up on top of the stack.
    let frame = args
        .iter()
        .rev()
        .copied()
        .chain(std::iter::once(RET_SENTINEL));
    for word in frame {
        cpu.push32(mmu, word)?;
    }
    cpu.regs.eip = target_va;
    run_until_sentinel(cpu, mmu, registry, state)?;
    let result = cpu.regs.get32(Reg32::Eax);
    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::emulator::{mmu::Perm, Mmu};

    /// Shared byte buffer a [`CapSink`] writes into and the test
    /// reads back.
    #[cfg(feature = "trace")]
    type CaptureBuf = std::sync::Arc<std::sync::Mutex<Vec<u8>>>;

    /// Capture sink shared between the trace state (which owns the
    /// boxed writer) and the test (which reads back the JSONL line).
    #[cfg(feature = "trace")]
    struct CapSink(CaptureBuf);

    #[cfg(feature = "trace")]
    impl std::io::Write for CapSink {
        fn write(&mut self, bytes: &[u8]) -> std::io::Result<usize> {
            self.0.lock().unwrap().extend_from_slice(bytes);
            Ok(bytes.len())
        }

        fn flush(&mut self) -> std::io::Result<()> {
            Ok(())
        }
    }

    /// Everything written to the capture sink so far, as UTF-8 text.
    #[cfg(feature = "trace")]
    fn captured_text(buf: &CaptureBuf) -> String {
        String::from_utf8(buf.lock().unwrap().clone()).unwrap()
    }

    /// MMU with a read/write mapping set up via
    /// `map(0x4000, 0x4000, ..)` plus a CPU whose `esp` starts at
    /// `0x7000` — the stack every dispatch test pushes its fake
    /// call frame onto.
    fn stack_fixture() -> (Mmu, Cpu) {
        let mut mmu = Mmu::new();
        mmu.map(0x4000, 0x4000, Perm::R | Perm::W);
        let mut cpu = Cpu::new();
        cpu.regs.set_esp(0x7000);
        (mmu, cpu)
    }

    /// Stub that ignores its inputs and returns `0xCAFE`.
    fn dummy_stub(
        _cpu: &mut Cpu,
        _mmu: &mut Mmu,
        _h: &mut HostState,
        _r: &Registry,
    ) -> Result<u32, Win32Error> {
        Ok(0xCAFE)
    }

    #[test]
    fn registry_assigns_stable_thunk_addresses() {
        let mut reg = Registry::new();
        let foo = reg.register("kernel32.dll", "Foo", dummy_stub, 1);
        let bar = reg.register("kernel32.dll", "Bar", dummy_stub, 0);
        // Registering the same (dll, name) pair again must hand
        // back the address assigned the first time.
        let foo_again = reg.register("kernel32.dll", "Foo", dummy_stub, 1);
        assert_eq!(foo, foo_again);
        assert_ne!(foo, bar);
        assert!(reg.is_thunk(foo));
    }

    #[test]
    fn registry_resolve_is_case_insensitive_on_dll_name() {
        let mut reg = Registry::new();
        let thunk = reg.register("KERNEL32.DLL", "GetProcessHeap", dummy_stub, 0);
        for spelling in ["kernel32.dll", "Kernel32.Dll"] {
            assert_eq!(reg.resolve(spelling, "GetProcessHeap"), Some(thunk));
        }
    }

    #[test]
    fn cdecl_trace_arg_count_covers_msvcrt_heap_surface() {
        // Single-arg msvcrt cdecl entries.
        for name in ["malloc", "free"] {
            assert_eq!(cdecl_trace_arg_count("msvcrt.dll", name), Some(1));
        }
        assert_eq!(
            cdecl_trace_arg_count("msvcrt.dll", "??2@YAPAXI@Z"),
            Some(1),
            "operator new",
        );
        assert_eq!(
            cdecl_trace_arg_count("msvcrt.dll", "??3@YAXPAX@Z"),
            Some(1),
            "operator delete",
        );
        // Two-arg msvcrt cdecl entries (pre-declared for future
        // calloc / realloc registrations).
        for name in ["calloc", "realloc"] {
            assert_eq!(cdecl_trace_arg_count("msvcrt.dll", name), Some(2));
        }
    }

    #[test]
    fn cdecl_trace_arg_count_returns_none_for_unknown_calls() {
        assert_eq!(
            cdecl_trace_arg_count("kernel32.dll", "GetProcessHeap"),
            None
        );
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "memcpy"), None);
        assert_eq!(
            cdecl_trace_arg_count("MSVCRT.DLL", "malloc"),
            None,
            "match is exact-case on dll string per registry contract"
        );
    }

    #[cfg(feature = "trace")]
    #[test]
    fn dispatch_emits_size_arg_for_msvcrt_malloc() {
        // Dummy malloc-shaped stub: returns a known pointer (the
        // value the trace event records as `ret`); the SIZE arg
        // comes from the stack at [esp+4] and must surface as
        // `args:[2928]`.
        fn dummy_malloc_stub(
            _cpu: &mut Cpu,
            _mmu: &mut Mmu,
            _h: &mut HostState,
            _r: &Registry,
        ) -> Result<u32, Win32Error> {
            Ok(0x6000_0000)
        }

        // Bring up an MMU + CPU + registry exactly as a real
        // dispatch would see them, with the trace routed into
        // a buffer the test can inspect.
        let buf = CaptureBuf::default();
        let (mut mmu, mut cpu) = stack_fixture();
        mmu.trace
            .set_sink(Box::new(CapSink(std::sync::Arc::clone(&buf))));

        let mut registry = Registry::new();
        let addr = registry.register("msvcrt.dll", "malloc", dummy_malloc_stub, 0);

        // Cdecl call frame: ret addr at [esp], size at [esp+4].
        // 2928 == 0xb70 — matches the auditor reference value.
        cpu.push32(&mut mmu, 2928).unwrap(); // arg0 (size)
        cpu.push32(&mut mmu, 0x1c218058).unwrap(); // saved ret addr (call-site EIP)

        cpu.regs.eip = addr;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        // The captured JSONL line should carry args:[2928] (decimal,
        // matching the existing ev_win32_call format), the dummy
        // pointer in `ret`, and the call-site EIP (NOT the thunk).
        let s = captured_text(&buf);
        assert!(s.contains(r#""kind":"win32_call""#), "line: {s}");
        assert!(s.contains(r#""dll":"msvcrt.dll""#), "line: {s}");
        assert!(s.contains(r#""name":"malloc""#), "line: {s}");
        assert!(
            s.contains(r#""args":[2928]"#),
            "expected args:[2928] (== 0xb70), got: {s}",
        );
        assert!(s.contains(r#""ret":"0x60000000""#), "line: {s}");
        assert!(s.contains(r#""eip":"0x1c218058""#), "line: {s}");
    }

    #[cfg(feature = "trace")]
    #[test]
    fn dispatch_emits_pointer_arg_for_msvcrt_operator_delete() {
        // Dummy `operator delete`-shaped stub: nothing to return.
        fn dummy_delete_stub(
            _cpu: &mut Cpu,
            _mmu: &mut Mmu,
            _h: &mut HostState,
            _r: &Registry,
        ) -> Result<u32, Win32Error> {
            Ok(0)
        }

        let buf = CaptureBuf::default();
        let (mut mmu, mut cpu) = stack_fixture();
        mmu.trace
            .set_sink(Box::new(CapSink(std::sync::Arc::clone(&buf))));

        let mut registry = Registry::new();
        let addr = registry.register("msvcrt.dll", "??3@YAXPAX@Z", dummy_delete_stub, 0);

        cpu.push32(&mut mmu, 0x6000_02c0).unwrap(); // ptr arg
        cpu.push32(&mut mmu, 0x1c237e58).unwrap(); // saved ret

        cpu.regs.eip = addr;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        let s = captured_text(&buf);
        assert!(
            s.contains(r#""args":[1610613440]"#),
            "expected args:[1610613440] (== 0x600002c0), got: {s}",
        );
        assert!(s.contains(r#""name":"??3@YAXPAX@Z""#), "line: {s}");
    }

    #[test]
    fn dispatch_pops_return_addr_and_args() {
        let (mut mmu, mut cpu) = stack_fixture();

        let mut registry = Registry::new();
        let addr = registry.register("kernel32.dll", "Sample", dummy_stub, 2);

        // Lay out a fake call frame: ret addr, arg1, arg2.
        cpu.push32(&mut mmu, 0x4444).unwrap(); // arg2
        cpu.push32(&mut mmu, 0x3333).unwrap(); // arg1
        cpu.push32(&mut mmu, 0x2222).unwrap(); // saved ret addr
        let esp_before = cpu.regs.esp();

        cpu.regs.eip = addr;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        // After: eax=0xCAFE, eip = ret addr, esp pops 12 bytes
        // total (1 ret + 2 args).
        assert_eq!(cpu.regs.get32(crate::emulator::regs::Reg32::Eax), 0xCAFE);
        assert_eq!(cpu.regs.eip, 0x2222);
        assert_eq!(cpu.regs.esp(), esp_before + 12);
    }
}