Skip to main content

ud_emulator/win32/
mod.rs

1//! Win32 stub registry + per-DLL host implementations of the
2//! functions the loaded codec DLLs import.
3//!
4//! Each stub is a Rust function pointer with the signature
5//! [`StubFn`]. The PE loader, when populating the IAT, looks up
6//! `(dll_name_lowercased, function_name)` in [`Registry`] and
7//! writes the synthetic `StubAddr` (a guest address that lives
8//! in the unmapped "thunk space" near `0xFFFE_0000`) into the
9//! IAT slot.
10//!
11//! At call time, the integer ISA executor sees `eip` jump to a
12//! thunk address. It detects this via [`Registry::is_thunk`]
13//! and dispatches to the stub directly, popping the right number
14//! of bytes off the guest stack for the calling convention.
15//!
16//! All stubs are stdcall (callee-cleanup) for round 1; the
17//! `arg_dwords` field carries the count. Round-2 will add cdecl
18//! (caller-cleanup) once vfw32 needs it.
19//!
20//! Reference for each function: the corresponding MSDN page
21//! (linked in source comments next to each stub).
22
23use std::collections::BTreeMap;
24
25use crate::emulator::{Cpu, Mmu};
26
27pub mod advapi32;
28pub mod comctl32;
29pub mod gdi32;
30pub mod kernel32;
31pub mod mfplat;
32pub mod msi;
33pub mod msiexec;
34pub mod msvcrt;
35pub mod ole32;
36pub mod shell32;
37pub mod shlwapi;
38pub mod user32;
39pub mod version;
40pub mod vfw32;
41pub mod winmm;
42
/// First synthetic thunk address. Chosen well above any plausible
/// `ImageBase + section.VirtualAddress` so it cannot be mistaken
/// for a real DLL byte. Each registered stub gets the next
/// 16-byte slot (see `THUNK_STRIDE`).
pub const THUNK_BASE: u32 = 0xFFFE_0000;
/// Size in bytes of one thunk slot. Presumably stub `n` is
/// assigned `THUNK_BASE + n * THUNK_STRIDE` — confirm against
/// the registration code, which is not in this part of the file.
const THUNK_STRIDE: u32 = 16;
49
/// Signature every Win32 stub uses.
///
/// On success the stub returns the dword to put in `eax`; on
/// failure it returns a [`Win32Error`]. The stub internally
/// reads its arguments off the guest stack via the [`Cpu`] /
/// [`Mmu`] handles. The runtime takes care of popping
/// `arg_dwords * 4` bytes from the guest stack after the stub
/// returns (stdcall callee-cleanup).
///
/// `&mut HostState` gives the stub access to the host-side
/// "operating system" state (heap, modules, logs, …).
///
/// `&Registry` is passed so a stub can re-enter the run-loop to
/// call back into the guest (used by the round-2 `vfw32` stub
/// surface, which has to dispatch the codec DLL's `DriverProc`
/// before returning to the IAT caller).
pub type StubFn = fn(&mut Cpu, &mut Mmu, &mut HostState, &Registry) -> Result<u32, Win32Error>;
63
/// One stub call recorded for analysis. Populated whenever
/// [`HostState::trace_stubs`] is set; the [`HostState::stub_calls`]
/// vector accumulates these in call order.
#[derive(Clone, Debug)]
pub struct StubCall {
    /// The DLL the call targeted (`"kernel32.dll"`, …).
    pub dll: String,
    /// The function name (`"CreateFileA"`, …).
    pub name: String,
    /// Dword arguments captured off the guest stack at call
    /// entry, before the stub ran. Length is the stdcall
    /// `arg_dwords` count, or a per-call override for known
    /// cdecl shapes.
    pub args: Vec<u32>,
    /// The `eax` value the stub returned.
    pub ret: u32,
    /// Call-site EIP — the saved return address on the guest
    /// stack at call entry, i.e. the instruction the codec
    /// will resume at when the stub returns.
    pub call_site_eip: u32,
}
85
/// Information stored alongside each stub.
#[derive(Clone)]
pub struct StubEntry {
    /// Name of the DLL that exports this function. The module
    /// docs describe lookups by lowercased DLL name; presumably
    /// this field is stored lowercased too — confirm at the
    /// registration site.
    pub dll: String,
    /// Name of the imported function this stub implements.
    pub name: String,
    /// Host implementation invoked when the guest calls the thunk.
    pub func: StubFn,
    /// Number of dword arguments to pop off the stack (stdcall
    /// callee-cleanup). cdecl callers will be added in round 2
    /// with a separate flag.
    pub arg_dwords: u32,
    /// The synthetic guest address that, when called, invokes
    /// this stub.
    pub thunk_addr: u32,
}
100
/// Errors a stub can raise. Wrapped in `crate::Error::Win32`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Win32Error {
    /// No stub registered for the requested `(dll, name)` pair.
    /// PE-load-time error; surfaces from
    /// `crate::pe::Loader::resolve_imports`.
    UnknownImport { dll: String, name: String },
    /// Stub-side argument validation failed.
    InvalidArgument { stub: &'static str, reason: String },
    /// Heap call referenced an unknown allocation.
    InvalidHeapBlock { stub: &'static str, addr: u32 },
    /// The per-run instruction budget set on
    /// [`HostState::instruction_budget`] was exhausted before
    /// the guest reached `RET_SENTINEL`. Analysis front-ends
    /// use this to cap adversarial samples that loop. The
    /// state captured up to the budget point — coverage
    /// map, stub trace, register snapshot — is still valid.
    BudgetExhausted { executed: u64 },
}

/// Human-readable rendering of each error variant. The text is
/// emitted verbatim; formatter width / fill flags are not applied.
impl core::fmt::Display for Win32Error {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::BudgetExhausted { executed } => f.write_fmt(format_args!(
                "instruction budget exhausted after {executed} steps without reaching RET_SENTINEL"
            )),
            Self::InvalidHeapBlock { stub, addr } => {
                // Address is rendered as a zero-padded 32-bit hex value.
                f.write_fmt(format_args!("{stub}: unknown heap allocation {addr:#010x}"))
            }
            Self::InvalidArgument { stub, reason } => {
                f.write_fmt(format_args!("{stub}: {reason}"))
            }
            Self::UnknownImport { dll, name } => {
                f.write_fmt(format_args!("no Round-1 stub for import {dll}!{name}"))
            }
        }
    }
}
142
/// One entry in the open-codec table — a "Handle to Installable
/// Compressor" in MSDN's vfw32 vocabulary. Entries are stored in
/// [`ProcessState::hics`], keyed by the synthetic HIC value.
#[derive(Debug, Clone)]
pub struct HicEntry {
    /// 4-byte fcc type ('VIDC' for video).
    pub fcc_type: u32,
    /// 4-byte fcc handler ('cvid' for Cinepak, 'IV50' for Indeo 5).
    pub fcc_handler: u32,
    /// Open mode (vfw.h: 1 = ICMODE_COMPRESS, 2 = ICMODE_DECOMPRESS, …).
    pub mode: u32,
    /// VA of the codec DLL's `DriverProc` export (the entry point
    /// that every IC* call dispatches into).
    pub driver_proc_va: u32,
    /// `dwDriverId` to pass back to `DriverProc` on every call —
    /// the value `DriverProc(_, _, DRV_OPEN, _, _)` returned.
    pub driver_id: u32,
}
160
/// Per-process emulator state. Phase 1 of the scheduler refactor
/// (see `magical-popping-oasis` plan): every field that is
/// conceptually scoped to a single Win32 *process* lives here.
/// [`HostState`] keeps these in its `processes` table (a
/// `BTreeMap` keyed by PID) and selects the current one through
/// its `active_pid` cursor; stub bodies reach the active
/// process's fields through [`HostState`]'s `Deref` /
/// `DerefMut` impls, so they read `state.heap` etc. directly.
///
/// The conceptual boundary: anything a child process spawned via
/// `CreateProcessA` should NOT share with its parent goes here.
/// Anything truly shared (virtual filesystem, virtual registry,
/// host-side trace buffers, the clock, the instruction budget)
/// stays on [`HostState`].
///
/// Note that the derived `Default` zero-initialises every field
/// (including `pid`); use [`ProcessState::new`] for a usable
/// bootstrap process.
#[derive(Default)]
pub struct ProcessState {
    /// Synthetic process identifier. `1` for the bootstrap
    /// process; `CreateProcessA` mints monotonically
    /// increasing values.
    pub pid: u32,
    /// PID of the process that called `CreateProcessA` to spawn
    /// this one. `0` for the bootstrap (no parent).
    pub parent_pid: u32,
    /// Image base where the process's primary PE is mapped.
    /// Each process gets a unique base so child PEs don't
    /// collide with the parent — `0x00400000` for the bootstrap,
    /// `0x10000000` / `0x20000000` / … for spawned children.
    pub image_base: u32,
    /// Exit code reported by `ExitProcess` / `TerminateProcess`
    /// on this process; `None` while the process is still
    /// running. Wakes any `WaitForSingleObject` on the process
    /// handle when set.
    pub exit_code: Option<u32>,
    /// Heap allocations keyed by guest address.
    pub heap: BTreeMap<u32, Vec<u8>>,
    /// Cursor for the next heap allocation. Walks through a
    /// dedicated guest-virtual region (configured by [`HostState::new`]).
    pub heap_cursor: u32,
    /// Exclusive end of the heap arena (`heap_end` as passed to
    /// [`ProcessState::new`]).
    pub heap_arena_end: u32,
    /// Default process heap handle returned by `GetProcessHeap`.
    pub process_heap_handle: u32,
    /// Loaded-module registry: name → ImageBase.
    pub modules: BTreeMap<String, u32>,
    /// Most-recently-loaded codec module's image base — returned
    /// by `GetModuleHandleA(NULL)`. Set to 0 if no DLL has been
    /// loaded yet.
    pub primary_module_base: u32,
    /// Open codec handles. Synthesised inside the host (no codec
    /// guest memory is consumed); each handle is a small integer
    /// the codec sees as an `HIC`.
    pub hics: BTreeMap<u32, HicEntry>,
    /// Counter for the next synthetic HIC. Starts at 1; 0 means
    /// "open failed".
    pub next_hic: u32,
    /// Default `DriverProc` VA used when a host caller invokes an
    /// `IC*` stub but has not staged a real codec image (i.e. for
    /// the no-fixture unit tests). Set to 0 when no codec is
    /// loaded — `ICOpen` then refuses to mint a HIC.
    pub default_driver_proc: u32,
    /// Set by `kernel32!ExitProcess` (and `TerminateProcess` on
    /// the current process) to break out of the emulator loop in
    /// lieu of unwinding to `RET_SENTINEL`. `Some(code)` means
    /// "this process asked to terminate"; the run-loop converts
    /// this into a clean return so the calling host code can
    /// introspect what happened.
    pub exit_requested: Option<u32>,
    /// Bump cursor of the read-only constant-data arena. Used by
    /// stubs like `GetCommandLineA` / `GetEnvironmentStrings`
    /// that need to hand out stable guest pointers to canned
    /// strings. The slab grows by `arena_const_alloc`; the arena
    /// range is configured by [`HostState::with_const_arena`],
    /// which seeds this cursor with the arena start.
    pub const_arena_cursor: u32,
    /// Exclusive end of the constant-data arena; allocations
    /// that would move the cursor past it are rejected.
    pub const_arena_end: u32,
    /// Cached pointer to the canned `"oxideav-vfw\0"` command
    /// line. Lazily populated by `GetCommandLineA`.
    pub command_line_ptr: u32,
    /// Cached pointer to the canned empty environment block.
    pub environment_strings_ptr: u32,
    /// Currently-live `HDC` values handed out by
    /// `gdi32!CreateCompatibleDC` / `user32!GetDC`. `None` until
    /// the first DC is allocated, then a populated set.
    pub gdi_hdcs: Option<std::collections::BTreeSet<u32>>,
    /// Round 26 — synthetic `HWND` registry. `CreateWindowExA`
    /// hands out `HWND_BASE + n` values; `IsWindow` consults this
    /// set; `DestroyWindow` removes from it. None of these HWNDs
    /// back a real window — DirectShow / VfW codecs only need the
    /// `HWND` value to feel non-NULL so they fall through to
    /// their headless code path.
    pub hwnd_registry: std::collections::BTreeSet<u32>,
    /// Counter for the next synthetic HWND allocation. Starts at
    /// 0; first HWND handed out is `HWND_BASE + 0`.
    pub next_hwnd_index: u32,
    /// Set of `DriverProc` VAs that have already received the
    /// one-time `DRV_LOAD` + `DRV_ENABLE` initialisation pair.
    /// Round 11 — without this, `IR50_32.DLL`'s `DRV_LOAD` handler
    /// (which allocates the codec's huffman / inverse-DCT tables
    /// at `[0x1009c770]`) never runs, and `ICDecompress` later
    /// reads `[0x1009c770] == NULL` and bails with
    /// `ICERR_BADIMAGE`. We track per-VA so multi-codec sandboxes
    /// (round 12+) don't double-load the same driver.
    pub loaded_drivers: std::collections::BTreeSet<u32>,
    /// Per-loaded-module resource directory location:
    /// `image_base → resource_dir_va`. Empty if the module has no
    /// `.rsrc` (PE Data Directory entry 2). Round 12 needs this so
    /// that `kernel32!FindResourceA` on `IR50_32.DLL` can locate
    /// the RT_BITMAP/112 entry that holds the codec's huffman /
    /// inverse-DCT tables. Without it the codec's `DRV_LOAD`
    /// chain bails at `0x10034d31 (jz 0x10034f61)` and
    /// `[0x1009c770]` stays NULL.
    pub module_resource_dirs: BTreeMap<u32, u32>,
    /// Round 25 — host-side bookkeeping for COM objects the
    /// guest has handed back to the test harness (live class
    /// factories + IBaseFilter pointers etc). See
    /// [`crate::com::ComObjectTable`] for the data layout. Each
    /// `ole32!CoCreateInstance` and every test-side
    /// `query_interface` / `add_ref` / `release` updates this
    /// table so a missing `Release` surfaces as a non-zero
    /// `total_refcount()` at end-of-test.
    pub com: crate::com::ComObjectTable,
    /// Round 55 — PRNG state for `msvcrt!rand` calls from
    /// sandboxed codec code. The initial value `1` (set by
    /// [`ProcessState::new`]) matches MSVC's documented
    /// "no `srand` called yet" state.
    /// Updated by both `msvcrt!srand(seed)` (from guest code)
    /// and by `Sandbox::set_rand_seed` / `with_rand_seed` (from
    /// host code) — they share the same field so the host can
    /// observe what the codec did to the state, and the codec
    /// can override the host-staged seed via its own `srand`.
    /// LCG step (Knuth-style, mod 2^32, output bits 30..16
    /// masked to 15 bits per MSVC's documented contract):
    ///
    /// ```text
    /// state = state * 214013 + 2531011  (mod 2^32)
    /// rand  = (state >> 16) & 0x7FFF
    /// ```
    pub rand_state: u32,
    /// Cursor for the next `TlsAlloc` slot. Slot indices are
    /// process-scoped (each `TlsAlloc` mints a fresh integer)
    /// but the *values* stored at those indices live in
    /// per-thread [`ThreadState::tls_slots`]. Phase 2 of the
    /// scheduler refactor.
    pub next_tls_slot: u32,
    /// Bottom of the thread-stack pool. `CreateThread` carves
    /// stacks from `[bottom, top)` walking down from
    /// `next_thread_stack_top`. Both are `0` when no pool has
    /// been configured — `CreateThread` reports an
    /// `InvalidArgument` error in that case.
    pub thread_stack_pool_bottom: u32,
    /// Next available stack-top for the next `CreateThread`.
    /// Decrements by [`THREAD_STACK_SIZE`] per spawned thread.
    pub next_thread_stack_top: u32,
    /// Bottom of the per-thread TIB (Thread Information Block)
    /// pool. `CreateThread` carves a 4 KiB TIB region per
    /// spawned thread. Phase 6 of the scheduler refactor.
    pub tib_pool_bottom: u32,
    /// Next available TIB base. Increments by
    /// [`THREAD_TIB_SIZE`] per spawned thread.
    pub next_tib_addr: u32,
}
320
/// Per-thread TIB size, in bytes (`0x1000` = 4 KiB, one page).
/// We only need enough room for the handful of fields
/// installer / codec CRTs actually touch (SEH chain head at
/// 0x00, self pointer at 0x18, LastError at 0x34, …).
pub const THREAD_TIB_SIZE: u32 = 0x0000_1000;

/// Stride between consecutive child PE image bases
/// (`0x1000_0000` = 256 MiB). 256 MiB per child gives plenty of
/// room for sections + heap + stack + TIB without colliding
/// with adjacent processes.
pub const CHILD_IMAGE_STRIDE: u32 = 0x1000_0000;

/// Per-child-process heap arena size. Each spawned child
/// carves this much from the host's child-heap pool.
pub const CHILD_HEAP_SIZE: u32 = 0x0100_0000; // 16 MiB

/// Default per-thread stack size, in bytes (`0x0001_0000` =
/// 64 KiB). 64 KiB matches the typical Win32 reserve size; many
/// codec / installer threads use only a few hundred bytes.
pub const THREAD_STACK_SIZE: u32 = 0x0001_0000;
341
342impl ProcessState {
343    /// Construct a fresh process with the heap arena at
344    /// `[heap_start, heap_end)` (caller is responsible for
345    /// mapping that region in the MMU as R+W).
346    #[must_use]
347    pub fn new(heap_start: u32, heap_end: u32) -> Self {
348        ProcessState {
349            pid: 1,
350            heap_cursor: heap_start,
351            heap_arena_end: heap_end,
352            process_heap_handle: 0xDEAD_BEEF,
353            next_hic: 1,
354            rand_state: 1,
355            ..ProcessState::default()
356        }
357    }
358}
359
/// Per-thread emulator state. Phase 2 of the scheduler refactor:
/// the live `Cpu` in [`crate::Sandbox`] continues to drive
/// execution, but every thread-local Win32 surface (TLS slots,
/// priority, parked CPU register file from past quanta) lives
/// here. Phase 3 will swap the live `Cpu` value with
/// `ThreadState::parked_cpu` on context switch.
///
/// The `parked_cpu` field is `None` for the currently-running
/// thread (its register file lives on `Sandbox::cpu`) and
/// `Some(cpu)` for any thread that has been suspended,
/// preempted, or is waiting on a synchronization object.
pub struct ThreadState {
    /// Synthetic thread identifier. The first thread is `1`;
    /// `CreateThread` mints monotonically increasing values.
    pub tid: u32,
    /// Owning process. For Phase 2 every thread maps to
    /// process `1`.
    pub pid: u32,
    /// Windows thread priority axis. Default is
    /// `THREAD_PRIORITY_NORMAL = 0`. Values run from `-15` to
    /// `15`, the two ends being the idle / realtime extremes.
    pub priority: i32,
    /// Map of TLS slot → value, set by `TlsSetValue` and read
    /// by `TlsGetValue`. Slot indices come from
    /// [`ProcessState::next_tls_slot`].
    pub tls_slots: BTreeMap<u32, u32>,
    /// Parked register file. Phase 3 will populate this when
    /// the scheduler swaps out the live `Cpu`.
    pub parked_cpu: Option<Cpu>,
    /// Quantum remaining for the scheduler's current slice.
    /// Phase 4 will tick this down; for now it's a placeholder
    /// initialised to [`DEFAULT_QUANTUM`].
    pub quantum_remaining: u32,
    /// Lifecycle state — driven by the scheduler. A freshly
    /// constructed thread starts as `Ready`.
    pub status: crate::sched::ThreadStatus,
    /// Active wait, if `status == Waiting`. Cleared on wake.
    pub wait: Option<crate::sched::WaitCondition>,
    /// Per-thread TIB (Thread Information Block) base — guest
    /// VA the thread's CPU references via FS:[0]. `0` for the
    /// bootstrap thread (which uses the runtime's shared
    /// `TEB_BASE`); `CreateThread` carves a fresh page out of
    /// the per-process TIB pool for each new thread.
    pub tib_addr: u32,
}
404
405impl Default for ThreadState {
406    fn default() -> Self {
407        ThreadState {
408            tid: 0,
409            pid: 0,
410            priority: 0,
411            tls_slots: BTreeMap::new(),
412            parked_cpu: None,
413            quantum_remaining: DEFAULT_QUANTUM,
414            status: crate::sched::ThreadStatus::Ready,
415            wait: None,
416            tib_addr: 0,
417        }
418    }
419}
420
421impl ThreadState {
422    /// Construct a fresh thread bound to the given process.
423    #[must_use]
424    pub fn new(tid: u32, pid: u32) -> Self {
425        ThreadState {
426            tid,
427            pid,
428            ..ThreadState::default()
429        }
430    }
431}
432
/// Default scheduler quantum, in guest instructions. Phase 4
/// will start consulting this; until then the value is purely
/// informational and only serves as the initial value of
/// `ThreadState::quantum_remaining`.
pub const DEFAULT_QUANTUM: u32 = 10_000;
438
/// The host-side state every stub may read or mutate.
///
/// This is the "operating system" of the sandbox — the heap, the
/// LastError TLS, the pseudo-tick counter, the loaded-module
/// registry, etc. One per emulator instance.
///
/// `HostState` is the union of (a) **truly shared** state
/// (virtual filesystem, virtual registry, trace buffers, clock,
/// instruction budget) and (b) the **current process**'s
/// [`ProcessState`], exposed through [`std::ops::Deref`] so that
/// existing stub bodies that read `state.heap` / `state.modules`
/// / etc. continue to compile unchanged. The split prepares
/// Phase 5 of the scheduler refactor (`CreateProcessA` spawning
/// a real child PE) without churning every Win32 stub today.
pub struct HostState {
    /// Process table keyed by PID. The bootstrap process has
    /// `pid = 1`; `CreateProcessA` mints children with
    /// monotonically increasing PIDs. The
    /// [`std::ops::Deref`] / [`std::ops::DerefMut`] impls on
    /// `HostState` resolve `state.heap` / `state.modules` /
    /// etc. through the *active* process, so stub bodies
    /// continue to compile unchanged.
    pub processes: BTreeMap<u32, ProcessState>,
    /// PID of the currently-running process — points into
    /// [`Self::processes`]. The scheduler updates this when
    /// switching threads across process boundaries. It must
    /// always name a live entry: the `Deref` impls and
    /// [`Self::cur_process`] panic otherwise.
    pub active_pid: u32,
    /// Cursor for the next `CreateProcessA` to mint a PID.
    pub next_pid: u32,
    /// Image base for the next child PE loaded via
    /// `CreateProcessA`. `0` until a child-image arena is
    /// configured (via [`Self::with_child_arena`]); the
    /// runtime walks the cursor forward by [`CHILD_IMAGE_STRIDE`]
    /// per spawn.
    pub next_child_image_base: u32,
    /// Heap-arena pool the next child process carves its
    /// per-process heap from. `[next_child_heap_base,
    /// child_heap_arena_end)`. Each spawn takes
    /// [`CHILD_HEAP_SIZE`] bytes.
    pub next_child_heap_base: u32,
    /// Exclusive end of the child heap pool described on
    /// [`Self::next_child_heap_base`].
    pub child_heap_arena_end: u32,
    /// Thread table keyed by TID. Phase 2 of the scheduler
    /// refactor: there is always at least one thread, with
    /// `tid = 1`, owning the live `Cpu` on `Sandbox`. Phase 3
    /// will populate more entries when `CreateThread` mints a
    /// real thread, and the scheduler will move the live `Cpu`
    /// in/out of `parked_cpu` here on context switch.
    pub threads: BTreeMap<u32, ThreadState>,
    /// TID of the currently-executing thread — points into
    /// [`Self::threads`]. Phase 3 will mutate this on context
    /// switch.
    pub active_tid: u32,
    /// Cursor for the next `CreateThread` to mint a TID. Stays
    /// monotonic across the lifetime of the sandbox.
    pub next_tid: u32,
    /// Last error code (`SetLastError` / `GetLastError`). Phase
    /// 6 will mirror this through the per-thread TIB at
    /// FS:[0x34] so guest code reading it directly sees the
    /// per-thread value. For now (single thread) the field on
    /// HostState is the source of truth.
    pub last_error: u32,
    /// Lazily-allocated guest address of the C-CRT `errno` cell.
    /// `None` until the first call to `msvcrt::_errno`, then
    /// stable for the lifetime of the sandbox so repeated calls
    /// return the same pointer (the contract `int * _errno(void)`
    /// requires).
    pub errno_cell: Option<u32>,
    /// Pseudo-tick counter incremented on every `GetTickCount`.
    pub tick: u32,
    /// Lines that the codec wrote to `OutputDebugString*`. Tests
    /// can introspect to confirm a known string was emitted.
    pub debug_log: Vec<String>,
    /// Lines that the codec wrote to `MessageBoxA` (also mirrored
    /// to `eprintln!`). Distinct from `debug_log` so a test can
    /// distinguish OutputDebugStringA traffic from real popups.
    pub message_box_log: Vec<String>,
    /// Optional per-run instruction budget. Decremented at each
    /// top-of-loop iteration in [`run_until_sentinel`] (both
    /// instruction steps and stub dispatches count). When it
    /// hits zero the run loop bails with
    /// [`Win32Error::BudgetExhausted`] so adversarial guests
    /// can't loop the host. `None` (the default) keeps the
    /// historical unbounded behaviour.
    pub instruction_budget: Option<u64>,
    /// Counts how many instructions actually ran in the last
    /// (or current) run-loop session. Useful for the analysis
    /// front-ends to report "ran for N instructions, budget
    /// was M". Reset to zero on each top-level run entry.
    pub instructions_executed: u64,
    /// Emulation-context layer (virtual filesystem, virtual
    /// registry, future surfaces). Win32 stubs that find the
    /// surface they need absent fall through to their
    /// fail-soft default. See [`crate::context::Context`]
    /// for the contract.
    ///
    /// NOTE(review): the field is a plain `Context`, not an
    /// `Option`; the optionality presumably lives on the
    /// individual surfaces inside `Context` — confirm.
    pub context: crate::context::Context,
    /// When `true`, [`dispatch_stub`] appends one line per Win32
    /// call to [`HostState::stub_trace`]. Off by default; round-8 tests flip
    /// it on while triaging which stub returns a bad value.
    pub trace_stubs: bool,
    /// Per-call trace lines populated when [`HostState::trace_stubs`] is on.
    /// Format: `dll!name(arg0, arg1, …) → 0xRET`. The args are
    /// the first `arg_dwords` (or, for known cdecl shapes, the
    /// override from [`cdecl_trace_arg_count`]) dwords off the
    /// guest stack, captured BEFORE the stub mutates them.
    pub stub_trace: Vec<String>,
    /// Structured per-call log, populated when [`HostState::trace_stubs`]
    /// is on. Parallel to [`HostState::stub_trace`]; analysis front-ends
    /// (the `ud analyze` JSON output) consume this directly so
    /// they don't have to re-parse the formatted string.
    pub stub_calls: Vec<StubCall>,
    /// Scheduler-owned wait-object table + global instruction
    /// clock. Phase 3 of the scheduler refactor.
    pub scheduler: crate::sched::Scheduler,
    /// When `Some`, the most recently dispatched stub asked the
    /// run loop to switch threads. The run loop drains the
    /// field after every stub return: a `Wait(...)` moves the
    /// current thread to `Waiting`; `Yield` re-queues it at the
    /// end of the Ready queue; `Exit { code }` terminates it.
    pub yield_requested: Option<crate::sched::YieldRequest>,
}
559
560impl Default for HostState {
561    fn default() -> Self {
562        let mut threads = BTreeMap::new();
563        threads.insert(1, ThreadState::new(1, 1));
564        let mut processes = BTreeMap::new();
565        let mut p = ProcessState::default();
566        p.pid = 1;
567        processes.insert(1, p);
568        HostState {
569            processes,
570            active_pid: 1,
571            next_pid: 2,
572            next_child_image_base: 0,
573            next_child_heap_base: 0,
574            child_heap_arena_end: 0,
575            threads,
576            active_tid: 1,
577            next_tid: 2,
578            last_error: 0,
579            errno_cell: None,
580            tick: 0,
581            debug_log: Vec::new(),
582            message_box_log: Vec::new(),
583            instruction_budget: None,
584            instructions_executed: 0,
585            context: crate::context::Context::default(),
586            trace_stubs: false,
587            stub_trace: Vec::new(),
588            stub_calls: Vec::new(),
589            scheduler: crate::sched::Scheduler::new(),
590            yield_requested: None,
591        }
592    }
593}
594
595impl std::ops::Deref for HostState {
596    type Target = ProcessState;
597    fn deref(&self) -> &ProcessState {
598        self.processes
599            .get(&self.active_pid)
600            .expect("active_pid must always point to a live process")
601    }
602}
603
604impl std::ops::DerefMut for HostState {
605    fn deref_mut(&mut self) -> &mut ProcessState {
606        self.processes
607            .get_mut(&self.active_pid)
608            .expect("active_pid must always point to a live process")
609    }
610}
611
612impl HostState {
613    /// Construct a HostState with the heap arena at `[heap_start,
614    /// heap_end)` (caller is responsible for mapping that region
615    /// in the MMU as R+W).
616    ///
617    /// The const-arena (used for canned strings handed back from
618    /// `GetCommandLineA` / `GetEnvironmentStrings` / etc.) is
619    /// **not** allocated here — call [`Self::with_const_arena`]
620    /// to set it up if those stubs are exercised. Tests that
621    /// don't use them can leave it at zero.
622    pub fn new(heap_start: u32, heap_end: u32) -> Self {
623        let mut s = HostState::default();
624        s.processes
625            .insert(1, ProcessState::new(heap_start, heap_end));
626        s
627    }
628
629    /// Borrow the active process. Resolved through
630    /// [`Self::active_pid`].
631    #[must_use]
632    pub fn cur_process(&self) -> &ProcessState {
633        self.processes
634            .get(&self.active_pid)
635            .expect("active_pid must always point to a live process")
636    }
637
638    /// Mutable borrow of the active process. Pair to
639    /// [`Self::cur_process`]; same invariant.
640    pub fn cur_process_mut(&mut self) -> &mut ProcessState {
641        self.processes
642            .get_mut(&self.active_pid)
643            .expect("active_pid must always point to a live process")
644    }
645
646    /// Borrow a process by PID, if it exists.
647    #[must_use]
648    pub fn process(&self, pid: u32) -> Option<&ProcessState> {
649        self.processes.get(&pid)
650    }
651
652    /// Mutable borrow of a process by PID.
653    pub fn process_mut(&mut self, pid: u32) -> Option<&mut ProcessState> {
654        self.processes.get_mut(&pid)
655    }
656
657    /// Borrow the currently-running thread. Falls back to the
658    /// bootstrap thread (`tid = 1`) on the freshly-constructed
659    /// state.
660    #[must_use]
661    pub fn cur_thread(&self) -> &ThreadState {
662        self.threads
663            .get(&self.active_tid)
664            .expect("active_tid must always point to a live thread (Default initialises tid 1)")
665    }
666
667    /// Mutable borrow of the currently-running thread. Pair to
668    /// [`Self::cur_thread`]; same invariant.
669    pub fn cur_thread_mut(&mut self) -> &mut ThreadState {
670        self.threads
671            .get_mut(&self.active_tid)
672            .expect("active_tid must always point to a live thread (Default initialises tid 1)")
673    }
674
675    /// Configure the const-arena (region for canned read-only
676    /// strings handed back to the codec). `[start, end)` is a
677    /// guest-virtual range the caller has already mapped R+W
678    /// (the arena bytes are written via `write_initializer`,
679    /// so any page perms suffice as long as the page is mapped).
680    pub fn with_const_arena(mut self, start: u32, end: u32) -> Self {
681        let p = self.cur_process_mut();
682        p.const_arena_cursor = start;
683        p.const_arena_end = end;
684        self
685    }
686
687    /// Configure the thread-stack pool. `CreateThread` carves
688    /// per-thread stacks from the top of this region walking
689    /// downward. `[bottom, top)` must already be mapped R+W in
690    /// the MMU.
691    pub fn with_thread_stack_pool(mut self, bottom: u32, top: u32) -> Self {
692        let p = self.cur_process_mut();
693        p.thread_stack_pool_bottom = bottom;
694        p.next_thread_stack_top = top;
695        self
696    }
697
698    /// Configure the per-thread TIB pool. `CreateThread` carves
699    /// 4 KiB TIB regions out of `[bottom, top)` walking upward.
700    /// Both ends must already be mapped R+W in the MMU. The
701    /// bootstrap thread continues to use the runtime's shared
702    /// `TEB_BASE`; only spawned threads consume this pool.
703    pub fn with_tib_pool(mut self, bottom: u32, top: u32) -> Self {
704        let p = self.cur_process_mut();
705        p.tib_pool_bottom = bottom;
706        p.next_tib_addr = bottom;
707        let _ = top; // explicit upper bound is informational
708        self
709    }
710
711    /// Configure the child-process pools: image-base cursor
712    /// + heap arena. `CreateProcessA` carves a child PE into
713    /// `[image_base, image_base + CHILD_IMAGE_STRIDE)` and a
714    /// 16 MiB heap out of `[heap_start, heap_end)`.
715    pub fn with_child_arena(
716        mut self,
717        image_base_cursor: u32,
718        heap_start: u32,
719        heap_end: u32,
720    ) -> Self {
721        self.next_child_image_base = image_base_cursor;
722        self.next_child_heap_base = heap_start;
723        self.child_heap_arena_end = heap_end;
724        self
725    }
726
727    /// Bump-allocate `n` bytes in the const arena. Returns the
728    /// guest address of the new slab. The caller is responsible
729    /// for [`Mmu::write_initializer`]'ing the contents.
730    pub fn arena_const_alloc(&mut self, n: u32) -> Result<u32, Win32Error> {
731        let aligned =
732            n.checked_add(15)
733                .map(|v| v & !15u32)
734                .ok_or_else(|| Win32Error::InvalidArgument {
735                    stub: "arena_const_alloc",
736                    reason: format!("size overflow: requested {n} (≈ {n:#x})"),
737                })?;
738        let addr = self.const_arena_cursor;
739        let next = addr
740            .checked_add(aligned)
741            .ok_or(Win32Error::InvalidArgument {
742                stub: "arena_const_alloc",
743                reason: "const arena address-space overflow".into(),
744            })?;
745        if next > self.const_arena_end {
746            return Err(Win32Error::InvalidArgument {
747                stub: "arena_const_alloc",
748                reason: format!(
749                    "const arena exhausted (need {n}, have {})",
750                    self.const_arena_end - addr
751                ),
752            });
753        }
754        self.const_arena_cursor = next;
755        Ok(addr)
756    }
757
758    /// Allocate a fresh slab in the heap arena and return its
759    /// guest address. Used by the round-2 marshalling helpers to
760    /// stage `ICDECOMPRESS` / `BITMAPINFOHEADER` / raw-frame
761    /// buffers in guest memory before calling `DriverProc`.
762    pub fn arena_alloc(&mut self, n: u32) -> Result<u32, Win32Error> {
763        let aligned =
764            n.checked_add(15)
765                .map(|v| v & !15u32)
766                .ok_or_else(|| Win32Error::InvalidArgument {
767                    stub: "arena_alloc",
768                    reason: format!("size overflow: requested {n} (≈ {n:#x})"),
769                })?;
770        let addr = self.heap_cursor;
771        let next = addr
772            .checked_add(aligned)
773            .ok_or(Win32Error::InvalidArgument {
774                stub: "arena_alloc",
775                reason: "heap address-space overflow".into(),
776            })?;
777        if next > self.heap_arena_end {
778            return Err(Win32Error::InvalidArgument {
779                stub: "arena_alloc",
780                reason: format!(
781                    "arena exhausted (need {n}, have {})",
782                    self.heap_arena_end - addr
783                ),
784            });
785        }
786        self.heap_cursor = next;
787        self.heap.insert(addr, vec![0u8; n as usize]);
788        Ok(addr)
789    }
790}
791
792/// Stub registry. Created once per emulator instance.
793#[derive(Default)]
794pub struct Registry {
795    by_thunk: BTreeMap<u32, StubEntry>,
796    by_name: BTreeMap<(String, String), u32>,
797    next_slot: u32,
798    /// Per-(dll, name) **data imports**. Some CRT symbols are
799    /// imported by name but are read as data (e.g.
800    /// `msvcrt!_adjust_fdiv`, an `int` flag the FDIV-erratum
801    /// fix-up code consults). The PE loader treats their IAT
802    /// slots as `mov ecx, [iat]; mov edx, [ecx]` — the IAT
803    /// slot is the address OF a 4-byte int, not a function
804    /// pointer. We pre-allocate a small read/write region for
805    /// these and patch the IAT slot to its address. The
806    /// `(value)` is whatever the symbol is documented to hold;
807    /// 0 is the safe default.
808    data_imports: BTreeMap<(String, String), DataImport>,
809    /// Bump cursor in the data-import slot region (assigned
810    /// addresses live in `[DATA_IMPORT_BASE, DATA_IMPORT_BASE +
811    /// DATA_IMPORT_SIZE)`).
812    next_data_slot: u32,
813}
814
/// Description of a single data-import slot. Its address is
/// what [`Registry::resolve`] returns for the symbol.
#[derive(Clone, Copy, Debug)]
pub struct DataImport {
    /// Guest address of the 4-byte slot; the PE loader writes
    /// this value into the IAT entry.
    pub addr: u32,
    /// Value to seed into `[addr]` when the slot is first
    /// allocated. Registering the same name again keeps the
    /// value from the first registration.
    pub initial: u32,
}
826
/// Guest address at which the data-import slot region starts —
/// see [`DataImport`]. The region is 4 KiB, which is ample: the
/// whole CRT data-import set is fewer than 16 dwords across all
/// codecs we expect to load.
pub const DATA_IMPORT_BASE: u32 = 0x7010_0000;
/// Size of the data-import region in bytes (4 KiB).
const DATA_IMPORT_SIZE: u32 = 4 * 1024;
/// Exclusive end address of the data-import region.
const DATA_IMPORT_END: u32 = DATA_IMPORT_BASE + DATA_IMPORT_SIZE;
833
834impl Registry {
835    pub fn new() -> Self {
836        Registry {
837            by_thunk: BTreeMap::new(),
838            by_name: BTreeMap::new(),
839            next_slot: 0,
840            data_imports: BTreeMap::new(),
841            next_data_slot: DATA_IMPORT_BASE,
842        }
843    }
844
845    /// Register a data import — a 4-byte symbol the codec
846    /// reads via `mov reg, [iat]; mov reg, [reg]`. Returns
847    /// the guest address that the IAT slot should point at.
848    /// Subsequent calls with the same `(dll, name)` return the
849    /// previously assigned slot.
850    pub fn register_data(&mut self, dll: &str, name: &str, initial: u32) -> u32 {
851        let key = (dll.to_ascii_lowercase(), name.to_string());
852        if let Some(d) = self.data_imports.get(&key) {
853            return d.addr;
854        }
855        let addr = self.next_data_slot;
856        let next = addr.saturating_add(4);
857        if next > DATA_IMPORT_END {
858            // Caller asked to register more data imports than
859            // we reserved space for. Return 0 — the loader
860            // handles "unresolved" by falling back to a thunk
861            // that will trap loudly.
862            return 0;
863        }
864        self.next_data_slot = next;
865        self.data_imports.insert(key, DataImport { addr, initial });
866        // Also expose it through the by-name resolver so the
867        // PE loader's ordinary lookup picks it up. The
868        // returned address is in the data region (not a thunk
869        // — `is_thunk(addr)` will correctly return false).
870        self.by_name
871            .insert((dll.to_ascii_lowercase(), name.to_string()), addr);
872        addr
873    }
874
875    /// Iterate the registered data imports. The PE loader uses
876    /// this to seed each slot's `initial` value into MMU memory
877    /// after the data-import region has been mapped.
878    pub fn data_imports(&self) -> impl Iterator<Item = (&String, &String, &DataImport)> {
879        self.data_imports
880            .iter()
881            .map(|((dll, name), d)| (dll, name, d))
882    }
883
884    /// Register a stub. Returns the synthetic thunk address that
885    /// the IAT slot should be populated with.
886    pub fn register(&mut self, dll: &str, name: &str, func: StubFn, arg_dwords: u32) -> u32 {
887        let key = (dll.to_ascii_lowercase(), name.to_string());
888        if let Some(addr) = self.by_name.get(&key) {
889            return *addr;
890        }
891        let thunk_addr = THUNK_BASE.wrapping_add(self.next_slot.wrapping_mul(THUNK_STRIDE));
892        self.next_slot += 1;
893        self.by_name.insert(key.clone(), thunk_addr);
894        self.by_thunk.insert(
895            thunk_addr,
896            StubEntry {
897                dll: key.0,
898                name: key.1,
899                func,
900                arg_dwords,
901                thunk_addr,
902            },
903        );
904        thunk_addr
905    }
906
907    /// Resolve an import. The PE loader uses this when populating
908    /// IAT slots. `dll_name` is matched case-insensitively.
909    pub fn resolve(&self, dll: &str, name: &str) -> Option<u32> {
910        let key = (dll.to_ascii_lowercase(), name.to_string());
911        self.by_name.get(&key).copied()
912    }
913
914    /// Register a fail-soft fallback thunk for an import we
915    /// don't have a stub for. The thunk's stub function looks
916    /// itself up in the registry and raises
917    /// [`crate::emulator::Trap::UnresolvedImport`] carrying
918    /// the (dll, name) pair on first call.
919    ///
920    /// The PE loader's fail-soft mode installs one of these
921    /// for every unresolved IAT entry so loading succeeds and
922    /// execution proceeds until the first unknown API actually
923    /// gets called. That's a much better signal than failing
924    /// at load time: the trap names the specific function to
925    /// implement next, and reveals which import paths are
926    /// reachable from the entry point.
927    pub fn register_unknown_fallback(&mut self, dll: &str, name: &str) -> u32 {
928        let key = (dll.to_ascii_lowercase(), name.to_string());
929        if let Some(addr) = self.by_name.get(&key) {
930            return *addr;
931        }
932        let thunk_addr = THUNK_BASE.wrapping_add(self.next_slot.wrapping_mul(THUNK_STRIDE));
933        self.next_slot += 1;
934        self.by_name.insert(key.clone(), thunk_addr);
935        // arg_dwords=0 is wrong for most stdcall APIs but
936        // doesn't matter — the stub traps before returning so
937        // dispatch_stub never reaches the stack-cleanup path.
938        self.by_thunk.insert(
939            thunk_addr,
940            StubEntry {
941                dll: key.0,
942                name: key.1,
943                func: stub_unresolved_fallback,
944                arg_dwords: 0,
945                thunk_addr,
946            },
947        );
948        thunk_addr
949    }
950
951    /// True iff `addr` is a registered thunk address.
952    pub fn is_thunk(&self, addr: u32) -> bool {
953        self.by_thunk.contains_key(&addr)
954    }
955
956    /// Look up the stub entry by its thunk address. Used by the
957    /// runtime when it sees `eip == thunk_addr`.
958    pub fn entry(&self, addr: u32) -> Option<&StubEntry> {
959        self.by_thunk.get(&addr)
960    }
961
962    /// Convenience: register every kernel32 stub. Returns the
963    /// number of stubs registered.
964    pub fn register_kernel32(&mut self) -> usize {
965        let before = self.by_name.len();
966        kernel32::register(self);
967        self.by_name.len() - before
968    }
969
970    /// Register every gdi32 stub. Returns the number registered.
971    pub fn register_gdi32(&mut self) -> usize {
972        let before = self.by_name.len();
973        gdi32::register(self);
974        self.by_name.len() - before
975    }
976
977    /// Register every user32 stub. Returns the number registered.
978    pub fn register_user32(&mut self) -> usize {
979        let before = self.by_name.len();
980        user32::register(self);
981        self.by_name.len() - before
982    }
983
984    /// Register every winmm stub. Returns the number registered.
985    pub fn register_winmm(&mut self) -> usize {
986        let before = self.by_name.len();
987        winmm::register(self);
988        self.by_name.len() - before
989    }
990
991    /// Register every advapi32 stub. Returns the number registered.
992    pub fn register_advapi32(&mut self) -> usize {
993        let before = self.by_name.len();
994        advapi32::register(self);
995        self.by_name.len() - before
996    }
997
998    /// Register every ole32 stub. Returns the number registered.
999    pub fn register_ole32(&mut self) -> usize {
1000        let before = self.by_name.len();
1001        ole32::register(self);
1002        self.by_name.len() - before
1003    }
1004
1005    /// Register every msvcrt stub. Returns the number registered.
1006    pub fn register_msvcrt(&mut self) -> usize {
1007        let before = self.by_name.len();
1008        msvcrt::register(self);
1009        self.by_name.len() - before
1010    }
1011
1012    /// Register the msvcrt stub set under `msvcr71.dll`. Used by
1013    /// codecs from the wmfdist11 era (mp43decd, mp4sdecd,
1014    /// wmvdecod, …) that link MSVC 7.1's runtime by its
1015    /// per-version name. Returns the number registered.
1016    pub fn register_msvcr71(&mut self) -> usize {
1017        let before = self.by_name.len();
1018        msvcrt::register_alias(self, "msvcr71.dll");
1019        self.by_name.len() - before
1020    }
1021
1022    /// Register the msvcrt stub set under `pncrt.dll`. Used by
1023    /// RealNetworks codecs that ship their own CRT fork.
1024    /// Returns the number registered.
1025    pub fn register_pncrt(&mut self) -> usize {
1026        let before = self.by_name.len();
1027        msvcrt::register_alias(self, "pncrt.dll");
1028        self.by_name.len() - before
1029    }
1030
1031    /// Register the msvcrt stub set under `msvcr80.dll` (Visual
1032    /// Studio 2005 CRT). Used by `camstudio-1.4-camcodec.dll`.
1033    pub fn register_msvcr80(&mut self) -> usize {
1034        let before = self.by_name.len();
1035        msvcrt::register_alias(self, "msvcr80.dll");
1036        self.by_name.len() - before
1037    }
1038
1039    /// Register the msvcrt stub set under `msvcr90.dll` (Visual
1040    /// Studio 2008 CRT). Used by `camstudio-1.5-camcodec.dll`.
1041    pub fn register_msvcr90(&mut self) -> usize {
1042        let before = self.by_name.len();
1043        msvcrt::register_alias(self, "msvcr90.dll");
1044        self.by_name.len() - before
1045    }
1046
1047    /// Register every mfplat (Media Foundation platform) stub.
1048    /// Returns the number registered.
1049    pub fn register_mfplat(&mut self) -> usize {
1050        let before = self.by_name.len();
1051        mfplat::register(self);
1052        self.by_name.len() - before
1053    }
1054
1055    /// Register every msi.dll stub — Windows Installer surface
1056    /// touched by application installers (QuickTime, …).
1057    /// Returns the number registered.
1058    pub fn register_msi(&mut self) -> usize {
1059        let before = self.by_name.len();
1060        msi::register(self);
1061        self.by_name.len() - before
1062    }
1063
1064    /// Register the version.dll / comctl32.dll / shell32.dll /
1065    /// shlwapi.dll stub families — the config-dialog and
1066    /// settings-file surface VfW codecs pull in alongside their
1067    /// decode core. Returns the number registered.
1068    pub fn register_shell_support(&mut self) -> usize {
1069        let before = self.by_name.len();
1070        version::register(self);
1071        comctl32::register(self);
1072        shell32::register(self);
1073        shlwapi::register(self);
1074        self.by_name.len() - before
1075    }
1076
1077    /// Register every Round-1+4+8+20 stub family in one call:
1078    /// kernel32, gdi32, user32, winmm, advapi32, ole32, msvcrt,
1079    /// plus the round-27 host-COM thunk family used by
1080    /// [`crate::com::mint_host_filter_graph`].  Returns the total
1081    /// number registered.
1082    pub fn register_all(&mut self) -> usize {
1083        let host_before = self.by_name.len();
1084        crate::com::host_iface::register(self);
1085        crate::com::host_iface_r31::register(self);
1086        let host_count = self.by_name.len() - host_before;
1087        self.register_kernel32()
1088            + self.register_gdi32()
1089            + self.register_user32()
1090            + self.register_winmm()
1091            + self.register_advapi32()
1092            + self.register_ole32()
1093            + self.register_msvcrt()
1094            + self.register_msvcr71()
1095            + self.register_pncrt()
1096            + self.register_msvcr80()
1097            + self.register_msvcr90()
1098            + self.register_mfplat()
1099            + self.register_msi()
1100            + self.register_shell_support()
1101            + host_count
1102    }
1103}
1104
1105/// Read the `n`-th stdcall dword argument off the guest stack.
1106///
1107/// At entry, `esp` points to the saved return address (pushed by
1108/// the caller's CALL); the first argument is at `esp+4`, the
1109/// second at `esp+8`, etc.
1110pub fn arg_dword(cpu: &Cpu, mmu: &Mmu, n: u32) -> Result<u32, crate::emulator::Trap> {
1111    let addr = cpu.regs.esp().wrapping_add(4u32 * (n + 1));
1112    mmu.load32(addr)
1113}
1114
/// Cdecl arg-count override table for trace-event extraction.
///
/// Stdcall stubs already declare their argument count in
/// [`StubEntry::arg_dwords`] (the value the dispatch site uses
/// to pop the stack on return). Cdecl stubs declare `0` because
/// the *caller* cleans the stack — but the args are still on the
/// stack at call entry. For known-shape cdecl entries this
/// returns the per-call dword count so the trace probe can read
/// those dwords back into `args[]` on `kind=win32_call` events.
///
/// `dll` is compared ASCII-case-insensitively against
/// `msvcrt.dll` and the names the msvcrt stub set is also
/// registered under (`msvcr71.dll`, `pncrt.dll`, `msvcr80.dll`,
/// `msvcr90.dll`), so a codec that links a per-version CRT gets
/// the same traced arguments. `name` must match exactly.
///
/// Returns `None` if the `(dll, name)` pair has no override; in
/// that case the trace site falls back to the registered
/// `arg_dwords` (0 for any cdecl stub, leaving `args:[]`), so
/// this table is purely additive.
///
/// Reference: `docs/video/msmpeg4/audit/06-sandbox-O3-quant-init.md`
/// §5.2.3 — Auditor needs allocation sizes surfaced at call
/// time so the codec-context allocation can be located by size
/// match rather than by return-address differencing.
pub fn cdecl_trace_arg_count(dll: &str, name: &str) -> Option<u32> {
    // Every DLL name the msvcrt stub set is registered under.
    const CRT_DLLS: [&str; 5] = [
        "msvcrt.dll",
        "msvcr71.dll",
        "pncrt.dll",
        "msvcr80.dll",
        "msvcr90.dll",
    ];
    if !CRT_DLLS.iter().any(|crt| dll.eq_ignore_ascii_case(crt)) {
        return None;
    }
    match name {
        // Heap surface — single-arg shapes.
        //   void* malloc(size_t)                              — 1
        //   void  free(void*)                                 — 1
        //   void* operator new(unsigned int)  ??2@YAPAXI@Z    — 1
        //   void  operator delete(void*)      ??3@YAXPAX@Z    — 1
        "malloc" | "free" | "??2@YAPAXI@Z" | "??3@YAXPAX@Z" => Some(1),
        // Two-arg shapes — not registered today but cheap to
        // pre-declare so a future registration of `calloc` /
        // `realloc` automatically gets traced args without
        // revisiting this table.
        //   void* calloc(size_t count, size_t size)           — 2
        //   void* realloc(void*, size_t)                      — 2
        "calloc" | "realloc" => Some(2),
        _ => None,
    }
}
1155
1156/// Convert an MMU/CPU [`crate::emulator::Trap`] into a [`Win32Error`]
1157/// so a stub's argument-fetch failure surfaces as
1158/// `Win32Error::InvalidArgument`. Used by the gdi32 / user32 /
1159/// winmm modules.
1160pub fn trap_to_win32_local(stub: &'static str, t: crate::emulator::Trap) -> Win32Error {
1161    Win32Error::InvalidArgument {
1162        stub,
1163        reason: format!("{t}"),
1164    }
1165}
1166
1167/// Read a NUL-terminated 8-bit string from guest memory at `addr`,
1168/// stopping at NUL or after `max` bytes. Used by user32/winmm
1169/// stubs that take an `LPCSTR`.
1170pub fn read_cstr_local(mmu: &Mmu, mut addr: u32, max: u32) -> Result<String, Win32Error> {
1171    let mut bytes = Vec::new();
1172    for _ in 0..max {
1173        let b = mmu
1174            .load8(addr)
1175            .map_err(|t| trap_to_win32_local("read_cstr", t))?;
1176        if b == 0 {
1177            break;
1178        }
1179        bytes.push(b);
1180        addr = addr.wrapping_add(1);
1181    }
1182    Ok(String::from_utf8_lossy(&bytes).into_owned())
1183}
1184
1185/// Read a NUL-terminated UTF-16 string from guest memory at
1186/// `addr`, stopping at NUL or after `max_chars` 16-bit code
1187/// units. Used by every `*W` Win32 stub that takes an
1188/// `LPCWSTR`.
1189pub fn read_wide_cstr_local(mmu: &Mmu, mut addr: u32, max_chars: u32) -> String {
1190    let mut units = Vec::new();
1191    for _ in 0..max_chars {
1192        match mmu.load16(addr) {
1193            Ok(0) => break,
1194            Ok(u) => units.push(u),
1195            Err(_) => break,
1196        }
1197        addr = addr.wrapping_add(2);
1198    }
1199    String::from_utf16_lossy(&units)
1200}
1201
1202/// Stub function used by [`Registry::register_unknown_fallback`].
1203/// Looks up its own (dll, name) by reverse-resolving the entry
1204/// EIP against the registry and raises a
1205/// [`Win32Error::UnknownImport`] that the runtime surfaces as
1206/// `Trap::UnresolvedImport`. Execution halts on first call —
1207/// the operator sees the precise import to implement next.
1208fn stub_unresolved_fallback(
1209    cpu: &mut Cpu,
1210    _mmu: &mut Mmu,
1211    _state: &mut HostState,
1212    registry: &Registry,
1213) -> Result<u32, Win32Error> {
1214    let addr = cpu.regs.eip;
1215    let (dll, name) = registry
1216        .entry(addr)
1217        .map(|e| (e.dll.clone(), e.name.clone()))
1218        .unwrap_or_else(|| ("<unknown>".to_string(), format!("@{addr:#010x}")));
1219    Err(Win32Error::UnknownImport { dll, name })
1220}
1221
1222/// Dispatch a stub call. The runtime wires this into the executor
1223/// so that whenever `eip` lands on a thunk address, control
1224/// transfers here instead of fetching instruction bytes.
1225///
1226/// On entry: the guest CALL has already pushed the return
1227/// address; `eip` is the thunk address. On exit: `eax` holds the
1228/// stub's return value, `eip` is the popped return address, and
1229/// `arg_dwords*4` bytes have been removed from the stack
1230/// (stdcall callee-cleanup).
1231pub fn dispatch_stub(
1232    cpu: &mut Cpu,
1233    mmu: &mut Mmu,
1234    registry: &Registry,
1235    state: &mut HostState,
1236) -> Result<(), crate::Error> {
1237    let addr = cpu.regs.eip;
1238    let entry = registry
1239        .entry(addr)
1240        .ok_or_else(|| Win32Error::UnknownImport {
1241            dll: "<thunk>".into(),
1242            name: format!("@{:#010x}", addr),
1243        })?
1244        .clone();
1245    // Snapshot the call-site EIP (= the saved return address
1246    // pushed by the guest CALL — the instruction right after
1247    // the CALL, not the thunk address) and the first few args
1248    // off the guest stack BEFORE running the stub, since the
1249    // stub mutates the stack.
1250    //
1251    // Argument count: `entry.arg_dwords` carries the stdcall
1252    // count (the value used to pop the stack on return). For
1253    // cdecl stubs this is 0 — but for known cdecl shapes
1254    // (msvcrt heap entries) [`cdecl_trace_arg_count`] supplies a
1255    // per-call override so the trace surfaces the size / pointer
1256    // args rather than `args:[]`.
1257    //
1258    // The snapshot is always-on when `state.trace_stubs` is set
1259    // (the structured `stub_calls` vector consumes it) and is
1260    // additionally emitted as a JSONL event under the `trace`
1261    // feature flag.
1262    let capture_args = state.trace_stubs;
1263    #[cfg(feature = "trace")]
1264    let capture_args = capture_args || mmu.trace.has_sink();
1265    let snapshot: Option<(u32, Vec<u32>)> = if capture_args {
1266        let call_site_eip = mmu.load32(cpu.regs.esp()).unwrap_or(0);
1267        let n_args = cdecl_trace_arg_count(&entry.dll, &entry.name).unwrap_or(entry.arg_dwords);
1268        let mut args = Vec::with_capacity(n_args as usize);
1269        for i in 0..n_args {
1270            let a = arg_dword(cpu, mmu, i).unwrap_or(0);
1271            args.push(a);
1272        }
1273        Some((call_site_eip, args))
1274    } else {
1275        None
1276    };
1277    // Run the host-side stub.
1278    let ret = (entry.func)(cpu, mmu, state, registry)?;
1279    if state.trace_stubs {
1280        let (call_site_eip, args) = snapshot.clone().unwrap_or((0, Vec::new()));
1281        let args_str = args
1282            .iter()
1283            .map(|a| format!("{a:#010x}"))
1284            .collect::<Vec<_>>()
1285            .join(", ");
1286        state.stub_trace.push(format!(
1287            "{}!{}({args_str}) → {:#010x}",
1288            entry.dll, entry.name, ret
1289        ));
1290        state.stub_calls.push(StubCall {
1291            dll: entry.dll.clone(),
1292            name: entry.name.clone(),
1293            args,
1294            ret,
1295            call_site_eip,
1296        });
1297    }
1298    // Emit the trace event with the captured args + the actual
1299    // return value. Done before stack unwind so the EIP we log
1300    // is the call site, not the post-return PC.
1301    #[cfg(feature = "trace")]
1302    if let Some((call_site_eip, args)) = snapshot {
1303        mmu.trace
1304            .ev_win32_call(&entry.dll, &entry.name, &args, ret, call_site_eip);
1305    }
1306    // stdcall: pop return address, advance esp by arg_dwords*4,
1307    // set eax to the return value.
1308    let ret_addr = cpu.pop32(mmu)?;
1309    cpu.regs.set32(crate::emulator::regs::Reg32::Eax, ret);
1310    let new_esp = cpu
1311        .regs
1312        .esp()
1313        .wrapping_add(entry.arg_dwords.wrapping_mul(4));
1314    cpu.regs.set_esp(new_esp);
1315    cpu.regs.eip = ret_addr;
1316    Ok(())
1317}
1318
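// NOTE(review): the paragraph below describes the shared run
// loop, not `handle_yield`. It appears to have become separated
// from the function it documents, so it is kept as a plain
// comment to stop rustdoc from attaching it to `handle_yield`.
// TODO: move it back onto the run-loop function.
//
// Run the emulator until `eip == RET_SENTINEL`, dispatching to
// any Win32 stub thunk addresses encountered along the way.
//
// This is the shared run-loop body used both by [`crate::Sandbox`]
// and by re-entrant host stubs (notably the `vfw32` surface,
// which dispatches the codec's `DriverProc` synchronously
// inside an outer `IC*` call).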
1326/// Process a yield request from a freshly-returned stub. The
1327/// active thread transitions to the requested scheduler state,
1328/// then the run loop picks the next `Ready` thread via
1329/// [`schedule_next_thread`]. The live `Cpu` is parked into the
1330/// previous active thread's `parked_cpu` slot and the new
1331/// thread's parked Cpu replaces it.
1332fn handle_yield(cpu: &mut Cpu, state: &mut HostState, req: crate::sched::YieldRequest) {
1333    use crate::sched::{ThreadStatus, YieldRequest};
1334    match req {
1335        YieldRequest::Wait(cond) => {
1336            let t = state.cur_thread_mut();
1337            t.status = ThreadStatus::Waiting;
1338            t.wait = Some(cond);
1339        }
1340        YieldRequest::Yield => {
1341            let t = state.cur_thread_mut();
1342            t.status = ThreadStatus::Ready;
1343        }
1344        YieldRequest::Exit { code } => {
1345            let tid = state.active_tid;
1346            let t = state.cur_thread_mut();
1347            t.status = ThreadStatus::Terminated;
1348            t.wait = None;
1349            on_thread_terminated(state, tid);
1350            // Signal any pending `WaitForSingleObject` against
1351            // this thread's Thread WaitObject. (Phase 3c will
1352            // implement the wake side; here we just mark the
1353            // state machine so Phase 3c's wake-up sees a
1354            // terminated thread).
1355            let _ = code;
1356        }
1357    }
1358    schedule_next_thread(cpu, state);
1359}
1360
1361/// Park the live `Cpu` into the current thread and resume the
1362/// next `Ready` thread (if one exists). When no other Ready
1363/// thread is available, restores the current thread's CPU
1364/// unchanged — the run loop continues with the same thread
1365/// (which is fine for single-thread Sleep behaviour: the clock
1366/// fast-forward at the top of the loop wakes the same thread).
1367fn schedule_next_thread(cpu: &mut Cpu, state: &mut HostState) {
1368    use crate::sched::ThreadStatus;
1369    // Pick the next runnable thread other than the current
1370    // one — round-robin by TID order. Phase 4 will add
1371    // priority-aware picking.
1372    let cur_tid = state.active_tid;
1373    let next_tid = {
1374        let mut candidates: Vec<(i32, u32)> = state
1375            .threads
1376            .iter()
1377            .filter(|(tid, t)| **tid != cur_tid && matches!(t.status, ThreadStatus::Ready))
1378            .map(|(tid, t)| (t.priority, *tid))
1379            .collect();
1380        // Sort by descending priority then ascending TID for
1381        // deterministic round-robin within the same priority.
1382        candidates.sort_by(|a, b| b.0.cmp(&a.0).then(a.1.cmp(&b.1)));
1383        candidates.into_iter().next().map(|(_, tid)| tid)
1384    };
1385    let cur_is_runnable = matches!(
1386        state
1387            .threads
1388            .get(&cur_tid)
1389            .map(|t| t.status)
1390            .unwrap_or(ThreadStatus::Terminated),
1391        ThreadStatus::Ready | ThreadStatus::Running
1392    );
1393    let Some(next_tid) = next_tid else {
1394        // No other Ready thread. If the current is still
1395        // runnable, just keep going. Otherwise the run loop's
1396        // sleep-clock fast-forward will wake it; if that
1397        // doesn't apply we'd deadlock — but Phase 3b only
1398        // exposes Sleep, so this path is fine.
1399        if cur_is_runnable {
1400            state.cur_thread_mut().status = ThreadStatus::Running;
1401        }
1402        return;
1403    };
1404    // Park the live CPU into the current thread.
1405    let parked = std::mem::take(cpu);
1406    if let Some(t) = state.threads.get_mut(&cur_tid) {
1407        t.parked_cpu = Some(parked);
1408    }
1409    // Restore the next thread's parked CPU into the live one.
1410    let mut new_pid = None;
1411    if let Some(t) = state.threads.get_mut(&next_tid) {
1412        if let Some(c) = t.parked_cpu.take() {
1413            *cpu = c;
1414        }
1415        t.status = ThreadStatus::Running;
1416        new_pid = Some(t.pid);
1417    }
1418    state.active_tid = next_tid;
1419    // When the new thread lives in a different process, update
1420    // `active_pid` so the Deref-resolved per-process state
1421    // (heap arena, modules, hwnd registry, …) points at the
1422    // new process. Phase 5c.
1423    if let Some(pid) = new_pid {
1424        if state.processes.contains_key(&pid) {
1425            state.active_pid = pid;
1426        }
1427    }
1428}
1429
1430/// After a thread terminates, check whether its owning
1431/// process has any live threads left. If not, record the
1432/// process's exit code (defaulting to 0 if not already set)
1433/// and wake every thread blocked on a `WaitObject::Process`
1434/// targeting that PID. Phase 5c — chains the natural Win32
1435/// "last thread out marks the process exited" contract.
1436fn on_thread_terminated(state: &mut HostState, tid: u32) {
1437    let pid = match state.threads.get(&tid) {
1438        Some(t) => t.pid,
1439        None => return,
1440    };
1441    let alive = state
1442        .threads
1443        .values()
1444        .any(|t| t.pid == pid && !matches!(t.status, crate::sched::ThreadStatus::Terminated));
1445    if alive {
1446        return;
1447    }
1448    if let Some(p) = state.processes.get_mut(&pid) {
1449        if p.exit_code.is_none() {
1450            p.exit_code = Some(0);
1451        }
1452    }
1453    // Wake every Process-handle waiter on this PID.
1454    let handles: Vec<u32> = state
1455        .scheduler
1456        .objects
1457        .iter()
1458        .filter_map(|(h, obj)| match obj {
1459            crate::sched::WaitObject::Process { pid: p } if *p == pid => Some(*h),
1460            _ => None,
1461        })
1462        .collect();
1463    for h in handles {
1464        for waiter_tid in crate::sched::waiters_on(&state.threads, h) {
1465            if let Some(t) = state.threads.get_mut(&waiter_tid) {
1466                t.status = crate::sched::ThreadStatus::Ready;
1467                t.wait = None;
1468            }
1469        }
1470    }
1471    // Same for any pending Thread-handle waits on this TID.
1472    let thread_handles: Vec<u32> = state
1473        .scheduler
1474        .objects
1475        .iter()
1476        .filter_map(|(h, obj)| match obj {
1477            crate::sched::WaitObject::Thread { tid: t } if *t == tid => Some(*h),
1478            _ => None,
1479        })
1480        .collect();
1481    for h in thread_handles {
1482        for waiter_tid in crate::sched::waiters_on(&state.threads, h) {
1483            if let Some(t) = state.threads.get_mut(&waiter_tid) {
1484                t.status = crate::sched::ThreadStatus::Ready;
1485                t.wait = None;
1486            }
1487        }
1488    }
1489}
1490
1491/// Earliest `resume_after_instructions` across every
1492/// Sleep-waiting thread, or `None` if no thread is sleeping.
1493fn earliest_sleep_resume(state: &HostState) -> Option<u64> {
1494    state
1495        .threads
1496        .values()
1497        .filter_map(|t| {
1498            if matches!(t.status, crate::sched::ThreadStatus::Waiting) {
1499                if let Some(crate::sched::WaitCondition::Sleep {
1500                    resume_after_instructions,
1501                }) = t.wait
1502                {
1503                    return Some(resume_after_instructions);
1504                }
1505            }
1506            None
1507        })
1508        .min()
1509}
1510
1511/// Move every `Waiting`-on-Sleep thread whose resume target is
1512/// in the past back to `Ready`. Called from
1513/// [`run_until_sentinel`] after the global clock advances.
1514fn wake_sleep_if_due(state: &mut HostState) {
1515    let now = state.scheduler.instructions_global;
1516    for t in state.threads.values_mut() {
1517        if matches!(t.status, crate::sched::ThreadStatus::Waiting) {
1518            if let Some(crate::sched::WaitCondition::Sleep {
1519                resume_after_instructions,
1520            }) = t.wait
1521            {
1522                if now >= resume_after_instructions {
1523                    t.status = crate::sched::ThreadStatus::Ready;
1524                    t.wait = None;
1525                }
1526            }
1527        }
1528    }
1529}
1530
1531pub fn run_until_sentinel(
1532    cpu: &mut Cpu,
1533    mmu: &mut Mmu,
1534    registry: &Registry,
1535    state: &mut HostState,
1536) -> Result<(), crate::Error> {
1537    use crate::emulator::isa_int::{StepOk, RET_SENTINEL};
1538    // Reset the per-run instruction counter so analysis
1539    // front-ends can ask "how many did this top-level call
1540    // burn?" without subtracting from a stale snapshot.
1541    state.instructions_executed = 0;
1542    loop {
1543        // Honour any yield request the most recently dispatched
1544        // stub left behind. Phase 3 of the scheduler refactor:
1545        // a `Wait`/`Yield`/`Exit` request handed up from a stub
1546        // suspends the active thread and resumes the next
1547        // `Ready` one. Until Phase 3d ships, only `Sleep` and
1548        // `Yield` (single-thread) are observable here — both
1549        // resolve as "spin until wake-up" without an actual
1550        // context switch.
1551        if let Some(req) = state.yield_requested.take() {
1552            handle_yield(cpu, state, req);
1553        }
1554        // Scheduler nudge: when the active thread isn't
1555        // runnable (Terminated / Waiting because no other
1556        // Ready thread could be picked at yield time), look
1557        // for any thread sleeping on a Sleep wait. The
1558        // earliest wake target fast-forwards the global
1559        // clock; `wake_sleep_if_due` then moves matching
1560        // threads back to Ready, and `schedule_next_thread`
1561        // switches into one of them.
1562        let active_runnable = matches!(
1563            state.cur_thread().status,
1564            crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
1565        );
1566        if !active_runnable {
1567            if let Some(earliest) = earliest_sleep_resume(state) {
1568                state.scheduler.instructions_global =
1569                    state.scheduler.instructions_global.max(earliest);
1570                wake_sleep_if_due(state);
1571                schedule_next_thread(cpu, state);
1572            }
1573            // Active thread still not runnable AND no Ready
1574            // peer was found — the run is done. Return so the
1575            // outer host caller observes a clean exit rather
1576            // than a busy spin.
1577            if !matches!(
1578                state.cur_thread().status,
1579                crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
1580            ) {
1581                cpu.regs.eip = RET_SENTINEL;
1582                return Ok(());
1583            }
1584            state.cur_thread_mut().status = crate::sched::ThreadStatus::Running;
1585        }
1586        if state.exit_requested.is_some() {
1587            // `kernel32!ExitProcess` was called. Force eip to
1588            // the sentinel so the outer caller's stack-frame
1589            // cleanup is consistent and exit cleanly.
1590            cpu.regs.eip = RET_SENTINEL;
1591            return Ok(());
1592        }
1593        if cpu.regs.eip == RET_SENTINEL {
1594            // The active thread has run off the end of its
1595            // top-level callable. If it's the bootstrap thread
1596            // (TID 1), the entire run is done. Otherwise, mark
1597            // the thread Terminated and switch to the next
1598            // Ready one.
1599            if state.active_tid == 1 {
1600                return Ok(());
1601            }
1602            let dead_tid = state.active_tid;
1603            state.cur_thread_mut().status = crate::sched::ThreadStatus::Terminated;
1604            on_thread_terminated(state, dead_tid);
1605            schedule_next_thread(cpu, state);
1606            // After the switch the live CPU points at the next
1607            // thread; if no other was Ready, we're back on the
1608            // bootstrap thread and `schedule_next_thread`
1609            // left the live CPU untouched — so we'll re-enter
1610            // this branch and return.
1611            if state.active_tid == 1
1612                && matches!(
1613                    state.cur_thread().status,
1614                    crate::sched::ThreadStatus::Ready | crate::sched::ThreadStatus::Running
1615                )
1616                && cpu.regs.eip == RET_SENTINEL
1617            {
1618                return Ok(());
1619            }
1620            continue;
1621        }
1622        // Optional instruction budget — both instruction steps
1623        // and stub dispatches are counted as one "step" each,
1624        // since either is a unit of progress the host attributed
1625        // to the guest. When the budget hits zero, bail with a
1626        // clean `BudgetExhausted` so adversarial samples can't
1627        // loop the analyser host.
1628        if let Some(remaining) = state.instruction_budget.as_mut() {
1629            if *remaining == 0 {
1630                return Err(crate::Error::Win32(Win32Error::BudgetExhausted {
1631                    executed: state.instructions_executed,
1632                }));
1633            }
1634            *remaining -= 1;
1635        }
1636        state.instructions_executed = state.instructions_executed.saturating_add(1);
1637        state.scheduler.instructions_global = state.scheduler.instructions_global.saturating_add(1);
1638        // Quantum-based preemption (Phase 4). Each executed
1639        // instruction or stub dispatch counts against the
1640        // current thread's quantum. When it hits zero, ask the
1641        // scheduler to switch — but only when there is another
1642        // Ready thread to switch to, otherwise the current
1643        // thread just keeps the floor with a fresh quantum.
1644        {
1645            let quantum_default = state.scheduler.quantum_default;
1646            let cur_tid = state.active_tid;
1647            let t = state.cur_thread_mut();
1648            if t.quantum_remaining > 0 {
1649                t.quantum_remaining -= 1;
1650            }
1651            let exhausted = t.quantum_remaining == 0;
1652            if exhausted {
1653                t.quantum_remaining = quantum_default;
1654            }
1655            if exhausted {
1656                let has_peer = state.threads.iter().any(|(tid, ts)| {
1657                    *tid != cur_tid && matches!(ts.status, crate::sched::ThreadStatus::Ready)
1658                });
1659                if has_peer {
1660                    state.yield_requested = Some(crate::sched::YieldRequest::Yield);
1661                }
1662            }
1663        }
1664        if registry.is_thunk(cpu.regs.eip) {
1665            match dispatch_stub(cpu, mmu, registry, state) {
1666                Ok(()) => continue,
1667                Err(e) => {
1668                    #[cfg(feature = "trace")]
1669                    emit_trap_event(cpu, mmu, &e);
1670                    return Err(e);
1671                }
1672            }
1673        }
1674        match cpu.step(mmu) {
1675            Ok(StepOk::Continued) => continue,
1676            Ok(StepOk::Halted) => {
1677                // The active thread executed a `ret` whose
1678                // popped address was `RET_SENTINEL`. For the
1679                // bootstrap thread that's the run's exit; for
1680                // any other thread it means the thread proc
1681                // returned, so we mark it Terminated and let
1682                // the scheduler pick the next runnable peer.
1683                if state.active_tid == 1 {
1684                    return Ok(());
1685                }
1686                cpu.regs.eip = RET_SENTINEL;
1687                let dead_tid = state.active_tid;
1688                state.cur_thread_mut().status = crate::sched::ThreadStatus::Terminated;
1689                on_thread_terminated(state, dead_tid);
1690                schedule_next_thread(cpu, state);
1691                continue;
1692            }
1693            Err(t) => {
1694                let e: crate::Error = t.into();
1695                #[cfg(feature = "trace")]
1696                emit_trap_event(cpu, mmu, &e);
1697                return Err(e);
1698            }
1699        }
1700    }
1701}
1702
/// Trace-feature-gated: push one trap event describing `err`
/// through `mmu.trace.ev_trap`.
///
/// The event carries a label naming the error variant, an
/// address (the faulting address or trap `eip` when the variant
/// records one, otherwise the live `cpu.regs.eip`), the opcode
/// for the variants that record one, and a snapshot of the
/// eight general-purpose registers.
#[cfg(feature = "trace")]
fn emit_trap_event(cpu: &Cpu, mmu: &Mmu, err: &crate::Error) {
    use crate::emulator::regs::Reg32;
    use crate::emulator::Trap;

    // Fallback address for variants that carry none of their own.
    let live_eip = cpu.regs.eip;

    let (label, eip, opcode): (&str, u32, Option<u32>) = match err {
        crate::Error::Trap(trap) => match trap {
            Trap::MemoryFault { addr } => ("MemoryFault", *addr, None),
            Trap::ReadProtectFault { addr } => ("ReadProtectFault", *addr, None),
            Trap::WriteProtectFault { addr } => ("WriteProtectFault", *addr, None),
            Trap::ExecuteProtectFault { addr } => ("ExecuteProtectFault", *addr, None),
            Trap::UndefinedOpcode { eip, opcode } => ("UndefinedOpcode", *eip, Some(*opcode)),
            Trap::PrivilegedOpcode { eip, .. } => ("PrivilegedOpcode", *eip, None),
            Trap::DivideByZero { eip } => ("DivideByZero", *eip, None),
            Trap::UnresolvedImport { .. } => ("UnresolvedImport", live_eip, None),
            Trap::InstructionLimitExceeded { eip, .. } => ("InstructionLimitExceeded", *eip, None),
            Trap::UnimplementedMmx { eip, opcode, .. } => {
                ("UnimplementedMmx", *eip, Some(*opcode))
            }
        },
        crate::Error::PeLoader(_) => ("PeLoader", live_eip, None),
        crate::Error::Win32(_) => ("Win32", live_eip, None),
        crate::Error::NotImplemented => ("NotImplemented", live_eip, None),
    };

    // Register snapshot, in the same order the event has always
    // listed them.
    let gpr = |which: Reg32| cpu.regs.get32(which);
    let regs = [
        ("eax", gpr(Reg32::Eax)),
        ("ecx", gpr(Reg32::Ecx)),
        ("edx", gpr(Reg32::Edx)),
        ("ebx", gpr(Reg32::Ebx)),
        ("esp", cpu.regs.esp()),
        ("ebp", gpr(Reg32::Ebp)),
        ("esi", gpr(Reg32::Esi)),
        ("edi", gpr(Reg32::Edi)),
    ];
    mmu.trace.ev_trap(label, eip, opcode, &regs);
}
1747
1748/// Push args right-to-left, push the synthetic `RET_SENTINEL`,
1749/// jump to `target_va`, run the emulator until it returns,
1750/// and report the final `eax` value.
1751///
1752/// This is the building block both `Sandbox::call_dll_main`
1753/// and the round-2 `vfw32` stub surface use to invoke an
1754/// exported guest function with stdcall calling convention.
1755/// On entry, `cpu.regs.eip` may be anything; on exit it is
1756/// the popped return address (= `RET_SENTINEL`). Caller-saved
1757/// registers are not preserved beyond what the guest callee
1758/// preserves itself.
1759pub fn call_guest(
1760    cpu: &mut Cpu,
1761    mmu: &mut Mmu,
1762    registry: &Registry,
1763    state: &mut HostState,
1764    target_va: u32,
1765    args: &[u32],
1766) -> Result<u32, crate::Error> {
1767    use crate::emulator::isa_int::RET_SENTINEL;
1768    use crate::emulator::regs::Reg32;
1769    // Push args right-to-left.
1770    for a in args.iter().rev() {
1771        cpu.push32(mmu, *a)?;
1772    }
1773    cpu.push32(mmu, RET_SENTINEL)?;
1774    cpu.regs.eip = target_va;
1775    run_until_sentinel(cpu, mmu, registry, state)?;
1776    Ok(cpu.regs.get32(Reg32::Eax))
1777}
1778
#[cfg(test)]
mod tests {
    use super::*;
    use crate::emulator::{mmu::Perm, Mmu};
    #[cfg(feature = "trace")]
    use std::sync::{Arc, Mutex};

    /// Stub that ignores its inputs and returns `0xCAFE`.
    fn dummy_stub(
        _cpu: &mut Cpu,
        _mmu: &mut Mmu,
        _h: &mut HostState,
        _r: &Registry,
    ) -> Result<u32, Win32Error> {
        Ok(0xCAFE)
    }

    /// Shared fixture: an MMU with `map(0x4000, 0x4000, R | W)`
    /// applied and a CPU whose `esp` is `0x7000`.
    fn guest_with_stack() -> (Cpu, Mmu) {
        let mut mmu = Mmu::new();
        mmu.map(0x4000, 0x4000, Perm::R | Perm::W);
        let mut cpu = Cpu::new();
        cpu.regs.set_esp(0x7000);
        (cpu, mmu)
    }

    /// Trace sink that appends everything written to it into a
    /// buffer the test keeps a second handle to.
    #[cfg(feature = "trace")]
    struct CapSink(Arc<Mutex<Vec<u8>>>);

    #[cfg(feature = "trace")]
    impl std::io::Write for CapSink {
        fn write(&mut self, b: &[u8]) -> std::io::Result<usize> {
            self.0.lock().unwrap().extend_from_slice(b);
            Ok(b.len())
        }
        fn flush(&mut self) -> std::io::Result<()> {
            Ok(())
        }
    }

    /// Everything captured so far, decoded as UTF-8.
    #[cfg(feature = "trace")]
    fn captured_text(buf: &Mutex<Vec<u8>>) -> String {
        let bytes = buf.lock().unwrap().clone();
        String::from_utf8(bytes).unwrap()
    }

    #[test]
    fn registry_assigns_stable_thunk_addresses() {
        let mut r = Registry::new();
        let foo_first = r.register("kernel32.dll", "Foo", dummy_stub, 1);
        let bar = r.register("kernel32.dll", "Bar", dummy_stub, 0);
        let foo_again = r.register("kernel32.dll", "Foo", dummy_stub, 1);
        // Re-registering yields the same slot; a different name
        // yields a different one.
        assert_eq!(foo_first, foo_again);
        assert_ne!(foo_first, bar);
        assert!(r.is_thunk(foo_first));
    }

    #[test]
    fn registry_resolve_is_case_insensitive_on_dll_name() {
        let mut r = Registry::new();
        let addr = r.register("KERNEL32.DLL", "GetProcessHeap", dummy_stub, 0);
        for spelling in ["kernel32.dll", "Kernel32.Dll"] {
            assert_eq!(r.resolve(spelling, "GetProcessHeap"), Some(addr));
        }
    }

    #[test]
    fn cdecl_trace_arg_count_covers_msvcrt_heap_surface() {
        // Single-arg msvcrt cdecl entries.
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "malloc"), Some(1));
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "free"), Some(1));
        assert_eq!(
            cdecl_trace_arg_count("msvcrt.dll", "??2@YAPAXI@Z"),
            Some(1),
            "operator new",
        );
        assert_eq!(
            cdecl_trace_arg_count("msvcrt.dll", "??3@YAXPAX@Z"),
            Some(1),
            "operator delete",
        );
        // Two-arg msvcrt cdecl entries (pre-declared for future
        // calloc / realloc registrations).
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "calloc"), Some(2));
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "realloc"), Some(2));
    }

    #[test]
    fn cdecl_trace_arg_count_returns_none_for_unknown_calls() {
        assert_eq!(
            cdecl_trace_arg_count("kernel32.dll", "GetProcessHeap"),
            None
        );
        assert_eq!(cdecl_trace_arg_count("msvcrt.dll", "memcpy"), None);
        assert_eq!(
            cdecl_trace_arg_count("MSVCRT.DLL", "malloc"),
            None,
            "match is exact-case on dll string per registry contract"
        );
    }

    #[cfg(feature = "trace")]
    #[test]
    fn dispatch_emits_size_arg_for_msvcrt_malloc() {
        // malloc-shaped stub: the returned pointer is what the
        // trace event records as `ret`; the size argument comes
        // from the stack at [esp+4] and must surface as
        // `args:[2928]`.
        fn dummy_malloc_stub(
            _cpu: &mut Cpu,
            _mmu: &mut Mmu,
            _h: &mut HostState,
            _r: &Registry,
        ) -> Result<u32, Win32Error> {
            Ok(0x6000_0000)
        }

        let captured = Arc::new(Mutex::new(Vec::new()));
        let (mut cpu, mut mmu) = guest_with_stack();
        mmu.trace.set_sink(Box::new(CapSink(Arc::clone(&captured))));

        let mut registry = Registry::new();
        let thunk = registry.register("msvcrt.dll", "malloc", dummy_malloc_stub, 0);

        // Cdecl call frame: ret addr at [esp], size at [esp+4].
        // 2928 == 0xb70 — matches the auditor reference value.
        cpu.push32(&mut mmu, 2928).unwrap(); // arg0 (size)
        cpu.push32(&mut mmu, 0x1c218058).unwrap(); // saved ret addr (call-site EIP)

        cpu.regs.eip = thunk;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        // The captured JSONL line should carry args:[2928]
        // (decimal, matching the existing ev_win32_call format),
        // the dummy pointer in `ret`, and the call-site EIP (NOT
        // the thunk).
        let s = captured_text(&captured);
        assert!(s.contains(r#""kind":"win32_call""#), "line: {s}");
        assert!(s.contains(r#""dll":"msvcrt.dll""#), "line: {s}");
        assert!(s.contains(r#""name":"malloc""#), "line: {s}");
        assert!(
            s.contains(r#""args":[2928]"#),
            "expected args:[2928] (== 0xb70), got: {s}",
        );
        assert!(s.contains(r#""ret":"0x60000000""#), "line: {s}");
        assert!(s.contains(r#""eip":"0x1c218058""#), "line: {s}");
    }

    #[cfg(feature = "trace")]
    #[test]
    fn dispatch_emits_pointer_arg_for_msvcrt_operator_delete() {
        fn dummy_delete_stub(
            _cpu: &mut Cpu,
            _mmu: &mut Mmu,
            _h: &mut HostState,
            _r: &Registry,
        ) -> Result<u32, Win32Error> {
            Ok(0)
        }

        let captured = Arc::new(Mutex::new(Vec::new()));
        let (mut cpu, mut mmu) = guest_with_stack();
        mmu.trace.set_sink(Box::new(CapSink(Arc::clone(&captured))));

        let mut registry = Registry::new();
        let thunk = registry.register("msvcrt.dll", "??3@YAXPAX@Z", dummy_delete_stub, 0);

        cpu.push32(&mut mmu, 0x6000_02c0).unwrap(); // ptr arg
        cpu.push32(&mut mmu, 0x1c237e58).unwrap(); // saved ret

        cpu.regs.eip = thunk;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        let s = captured_text(&captured);
        assert!(
            s.contains(r#""args":[1610613440]"#),
            "expected args:[1610613440] (== 0x600002c0), got: {s}",
        );
        assert!(s.contains(r#""name":"??3@YAXPAX@Z""#), "line: {s}");
    }

    #[test]
    fn dispatch_pops_return_addr_and_args() {
        let (mut cpu, mut mmu) = guest_with_stack();

        let mut registry = Registry::new();
        let thunk = registry.register("kernel32.dll", "Sample", dummy_stub, 2);

        // Fake call frame, pushed in reverse: arg2, arg1, then
        // the saved return address on top.
        for word in [0x4444, 0x3333, 0x2222] {
            cpu.push32(&mut mmu, word).unwrap();
        }
        let esp_before = cpu.regs.esp();

        cpu.regs.eip = thunk;
        let mut state = HostState::new(0, 0);
        dispatch_stub(&mut cpu, &mut mmu, &registry, &mut state).unwrap();

        // After: eax=0xCAFE, eip = ret addr, esp pops 12 bytes
        // total (1 ret + 2 args).
        assert_eq!(cpu.regs.get32(crate::emulator::regs::Reg32::Eax), 0xCAFE);
        assert_eq!(cpu.regs.eip, 0x2222);
        assert_eq!(cpu.regs.esp(), esp_before + 12);
    }
}