Skip to main content

ud_emulator/
coverage.rs

1//! Execution coverage tracker.
2//!
3//! Records every guest address at which an instruction was
4//! fetched and every guest address that was written by the
5//! interpreter, with the size of the access. Built into the
6//! [`Mmu`](crate::emulator::Mmu) so the recording happens at
7//! the same probe sites the trace feature uses, without
8//! requiring the `trace` feature to be enabled.
9//!
10//! The static decompiler consumes this map to:
11//!
12//! * Identify code at addresses the static function-discovery
13//!   pass missed (unaligned entry points, obfuscated trampolines).
14//! * Flag pages that were written by the running code and then
15//!   executed from — the classic unpacker / self-modifying-code
16//!   signature. Bytes in those regions are not the static
17//!   source-of-truth and should round-trip through `@raw`
18//!   instead of a structured directive.
19//!
20//! The recorder does *not* attempt to resolve indirect call
21//! targets. A `call eax` may dispatch to a different function
22//! every time it executes; pinning the observed target into
23//! the static AST would actively miscode every other caller.
24//! What's recorded is "this byte was executed as the first
25//! byte of an instruction at least once", which is a property
26//! of the bytes themselves, not of the caller.
27
28use std::collections::{BTreeMap, BTreeSet};
29
30/// Per-address execution + write metadata harvested over the
31/// lifetime of one (or more) interpreter runs.
32///
33/// The map is always-on inside the [`Mmu`](crate::emulator::Mmu);
34/// callers that don't want coverage data simply ignore it.
35/// Cost is two hash operations per executed instruction and
36/// one per memory write, which is dwarfed by the rest of the
37/// interpreter.
38#[derive(Default, Clone)]
39pub struct CoverageMap {
40    /// Every guest address at which an instruction's first byte
41    /// was fetched. Sparse set — typical workloads execute a
42    /// few thousand to a few million unique addresses.
43    executed: BTreeSet<u32>,
44    /// Per-address memory writes. The value is the largest
45    /// access width seen at that address (1, 2, 4, or 8 bytes);
46    /// re-writes update the maximum so a later `mov [addr], al`
47    /// after a `mov [addr], rax` doesn't shrink the recorded
48    /// span.
49    writes: BTreeMap<u32, u8>,
50}
51
52impl CoverageMap {
53    /// Drop everything. Useful between runs when the caller
54    /// wants per-export coverage instead of cumulative.
55    pub fn clear(&mut self) {
56        self.executed.clear();
57        self.writes.clear();
58    }
59
60    /// Record an instruction fetch at `eip`. The first byte of
61    /// every dispatched instruction lands here exactly once per
62    /// occurrence — multiple hits at the same address share the
63    /// set entry. `insn_size` lets the analyzer infer per-
64    /// instruction coverage spans without re-decoding; today
65    /// only the first byte is recorded (size hint is reserved
66    /// for a future per-instruction-span variant).
67    pub fn record_exec(&mut self, eip: u32, _insn_size: u8) {
68        self.executed.insert(eip);
69    }
70
71    /// Record a guest memory write. `size` is the access width
72    /// in bytes (1/2/4/8). Each address tracks the *maximum*
73    /// width seen so the recorded write spans cover the
74    /// widest store that touched the byte.
75    pub fn record_write(&mut self, addr: u32, size: u32) {
76        let size_u8 = size.min(8).max(1) as u8;
77        for off in 0..size {
78            let target = addr.wrapping_add(off);
79            let slot = self.writes.entry(target).or_insert(size_u8);
80            if size_u8 > *slot {
81                *slot = size_u8;
82            }
83        }
84    }
85
86    /// Iterator over every executed address.
87    pub fn executed_addresses(&self) -> impl Iterator<Item = u32> + '_ {
88        self.executed.iter().copied()
89    }
90
91    /// Total number of distinct executed addresses.
92    pub fn executed_count(&self) -> usize {
93        self.executed.len()
94    }
95
96    /// Iterator over every written address. Re-yielding for
97    /// each byte covered by a wider write means a single
98    /// 4-byte `mov [addr], eax` produces 4 entries.
99    pub fn written_addresses(&self) -> impl Iterator<Item = u32> + '_ {
100        self.writes.keys().copied()
101    }
102
103    /// True when `addr` was both written and later executed
104    /// (in this map's lifetime). Detection is conservative —
105    /// the same address being written and executed *out of
106    /// order* still trips the check, since the static
107    /// decompiler treats either ordering as suspect.
108    pub fn is_self_modifying(&self, addr: u32) -> bool {
109        self.writes.contains_key(&addr) && self.executed.contains(&addr)
110    }
111
112    /// Every address that was both written and executed.
113    pub fn self_modifying_addresses(&self) -> impl Iterator<Item = u32> + '_ {
114        self.executed
115            .iter()
116            .copied()
117            .filter(move |a| self.writes.contains_key(a))
118    }
119
120    /// Collapse the executed-address set into contiguous
121    /// `[start, end)` ranges (end is exclusive). Two
122    /// consecutive addresses count as contiguous; gaps of any
123    /// size start a new range. Useful for spotting unaligned
124    /// code chunks the static function discovery missed.
125    pub fn executed_ranges(&self) -> Vec<std::ops::Range<u32>> {
126        let mut out = Vec::new();
127        let mut current: Option<std::ops::Range<u32>> = None;
128        for &addr in &self.executed {
129            match current.as_mut() {
130                Some(r) if r.end == addr => {
131                    r.end = addr.wrapping_add(1);
132                }
133                _ => {
134                    if let Some(r) = current.take() {
135                        out.push(r);
136                    }
137                    current = Some(addr..addr.wrapping_add(1));
138                }
139            }
140        }
141        if let Some(r) = current {
142            out.push(r);
143        }
144        out
145    }
146}
147
148#[cfg(test)]
149mod tests {
150    use super::*;
151
152    #[test]
153    fn record_exec_dedupes() {
154        let mut c = CoverageMap::default();
155        c.record_exec(0x1000, 3);
156        c.record_exec(0x1000, 3);
157        c.record_exec(0x1003, 2);
158        assert_eq!(c.executed_count(), 2);
159    }
160
161    #[test]
162    fn record_write_expands_per_byte() {
163        let mut c = CoverageMap::default();
164        c.record_write(0x2000, 4);
165        assert_eq!(c.written_addresses().count(), 4);
166    }
167
168    #[test]
169    fn is_self_modifying_detects_write_then_exec() {
170        let mut c = CoverageMap::default();
171        c.record_write(0x3000, 4);
172        c.record_exec(0x3001, 1);
173        assert!(c.is_self_modifying(0x3001));
174        assert!(!c.is_self_modifying(0x4000));
175    }
176
177    #[test]
178    fn executed_ranges_merges_adjacent() {
179        let mut c = CoverageMap::default();
180        // 0x1000, 0x1001, 0x1002 — contiguous.
181        // 0x1005 — separate.
182        for a in &[0x1000u32, 0x1001, 0x1002, 0x1005] {
183            c.record_exec(*a, 1);
184        }
185        let ranges = c.executed_ranges();
186        assert_eq!(ranges.len(), 2);
187        assert_eq!(ranges[0], 0x1000..0x1003);
188        assert_eq!(ranges[1], 0x1005..0x1006);
189    }
190}