ud_emulator/coverage.rs
1//! Execution coverage tracker.
2//!
3//! Records every guest address at which an instruction was
4//! fetched and every guest address that was written by the
5//! interpreter, with the size of the access. Built into the
6//! [`Mmu`](crate::emulator::Mmu) so the recording happens at
7//! the same probe sites the trace feature uses, without
8//! requiring the `trace` feature to be enabled.
9//!
10//! The static decompiler consumes this map to:
11//!
12//! * Identify code at addresses the static function-discovery
13//! pass missed (unaligned entry points, obfuscated trampolines).
14//! * Flag pages that were written by the running code and then
15//! executed from — the classic unpacker / self-modifying-code
16//! signature. Bytes in those regions are not the static
17//! source-of-truth and should round-trip through `@raw`
18//! instead of a structured directive.
19//!
20//! The recorder does *not* attempt to resolve indirect call
21//! targets. A `call eax` may dispatch to a different function
22//! every time it executes; pinning the observed target into
23//! the static AST would actively miscode every other caller.
24//! What's recorded is "this byte was executed as the first
25//! byte of an instruction at least once", which is a property
26//! of the bytes themselves, not of the caller.
27
28use std::collections::{BTreeMap, BTreeSet};
29
30/// Per-address execution + write metadata harvested over the
31/// lifetime of one (or more) interpreter runs.
32///
33/// The map is always-on inside the [`Mmu`](crate::emulator::Mmu);
34/// callers that don't want coverage data simply ignore it.
35/// Cost is two hash operations per executed instruction and
36/// one per memory write, which is dwarfed by the rest of the
37/// interpreter.
38#[derive(Default, Clone)]
39pub struct CoverageMap {
40 /// Every guest address at which an instruction's first byte
41 /// was fetched. Sparse set — typical workloads execute a
42 /// few thousand to a few million unique addresses.
43 executed: BTreeSet<u32>,
44 /// Per-address memory writes. The value is the largest
45 /// access width seen at that address (1, 2, 4, or 8 bytes);
46 /// re-writes update the maximum so a later `mov [addr], al`
47 /// after a `mov [addr], rax` doesn't shrink the recorded
48 /// span.
49 writes: BTreeMap<u32, u8>,
50}
51
52impl CoverageMap {
53 /// Drop everything. Useful between runs when the caller
54 /// wants per-export coverage instead of cumulative.
55 pub fn clear(&mut self) {
56 self.executed.clear();
57 self.writes.clear();
58 }
59
60 /// Record an instruction fetch at `eip`. The first byte of
61 /// every dispatched instruction lands here exactly once per
62 /// occurrence — multiple hits at the same address share the
63 /// set entry. `insn_size` lets the analyzer infer per-
64 /// instruction coverage spans without re-decoding; today
65 /// only the first byte is recorded (size hint is reserved
66 /// for a future per-instruction-span variant).
67 pub fn record_exec(&mut self, eip: u32, _insn_size: u8) {
68 self.executed.insert(eip);
69 }
70
71 /// Record a guest memory write. `size` is the access width
72 /// in bytes (1/2/4/8). Each address tracks the *maximum*
73 /// width seen so the recorded write spans cover the
74 /// widest store that touched the byte.
75 pub fn record_write(&mut self, addr: u32, size: u32) {
76 let size_u8 = size.min(8).max(1) as u8;
77 for off in 0..size {
78 let target = addr.wrapping_add(off);
79 let slot = self.writes.entry(target).or_insert(size_u8);
80 if size_u8 > *slot {
81 *slot = size_u8;
82 }
83 }
84 }
85
86 /// Iterator over every executed address.
87 pub fn executed_addresses(&self) -> impl Iterator<Item = u32> + '_ {
88 self.executed.iter().copied()
89 }
90
91 /// Total number of distinct executed addresses.
92 pub fn executed_count(&self) -> usize {
93 self.executed.len()
94 }
95
96 /// Iterator over every written address. Re-yielding for
97 /// each byte covered by a wider write means a single
98 /// 4-byte `mov [addr], eax` produces 4 entries.
99 pub fn written_addresses(&self) -> impl Iterator<Item = u32> + '_ {
100 self.writes.keys().copied()
101 }
102
103 /// True when `addr` was both written and later executed
104 /// (in this map's lifetime). Detection is conservative —
105 /// the same address being written and executed *out of
106 /// order* still trips the check, since the static
107 /// decompiler treats either ordering as suspect.
108 pub fn is_self_modifying(&self, addr: u32) -> bool {
109 self.writes.contains_key(&addr) && self.executed.contains(&addr)
110 }
111
112 /// Every address that was both written and executed.
113 pub fn self_modifying_addresses(&self) -> impl Iterator<Item = u32> + '_ {
114 self.executed
115 .iter()
116 .copied()
117 .filter(move |a| self.writes.contains_key(a))
118 }
119
120 /// Collapse the executed-address set into contiguous
121 /// `[start, end)` ranges (end is exclusive). Two
122 /// consecutive addresses count as contiguous; gaps of any
123 /// size start a new range. Useful for spotting unaligned
124 /// code chunks the static function discovery missed.
125 pub fn executed_ranges(&self) -> Vec<std::ops::Range<u32>> {
126 let mut out = Vec::new();
127 let mut current: Option<std::ops::Range<u32>> = None;
128 for &addr in &self.executed {
129 match current.as_mut() {
130 Some(r) if r.end == addr => {
131 r.end = addr.wrapping_add(1);
132 }
133 _ => {
134 if let Some(r) = current.take() {
135 out.push(r);
136 }
137 current = Some(addr..addr.wrapping_add(1));
138 }
139 }
140 }
141 if let Some(r) = current {
142 out.push(r);
143 }
144 out
145 }
146}
147
148#[cfg(test)]
149mod tests {
150 use super::*;
151
152 #[test]
153 fn record_exec_dedupes() {
154 let mut c = CoverageMap::default();
155 c.record_exec(0x1000, 3);
156 c.record_exec(0x1000, 3);
157 c.record_exec(0x1003, 2);
158 assert_eq!(c.executed_count(), 2);
159 }
160
161 #[test]
162 fn record_write_expands_per_byte() {
163 let mut c = CoverageMap::default();
164 c.record_write(0x2000, 4);
165 assert_eq!(c.written_addresses().count(), 4);
166 }
167
168 #[test]
169 fn is_self_modifying_detects_write_then_exec() {
170 let mut c = CoverageMap::default();
171 c.record_write(0x3000, 4);
172 c.record_exec(0x3001, 1);
173 assert!(c.is_self_modifying(0x3001));
174 assert!(!c.is_self_modifying(0x4000));
175 }
176
177 #[test]
178 fn executed_ranges_merges_adjacent() {
179 let mut c = CoverageMap::default();
180 // 0x1000, 0x1001, 0x1002 — contiguous.
181 // 0x1005 — separate.
182 for a in &[0x1000u32, 0x1001, 0x1002, 0x1005] {
183 c.record_exec(*a, 1);
184 }
185 let ranges = c.executed_ranges();
186 assert_eq!(ranges.len(), 2);
187 assert_eq!(ranges[0], 0x1000..0x1003);
188 assert_eq!(ranges[1], 0x1005..0x1006);
189 }
190}