Skip to main content

memf_strings/
extract.rs

1//! String extraction from physical memory.
2
3use memf_format::PhysicalMemoryProvider;
4
5use crate::{ClassifiedString, StringEncoding};
6
/// Upper bound, in bytes, on a single read from the provider (64 KiB).
const CHUNK_SIZE: usize = 1 << 16;
8
/// Configuration for string extraction.
///
/// Selects which encodings are scanned and how long a run of printable
/// characters must be before it is reported as a string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractConfig {
    /// Minimum number of characters for a string to be emitted. Default: 4.
    pub min_length: usize,
    /// Extract ASCII strings (printable bytes 0x20–0x7E plus \t \n \r). Default: true.
    pub ascii: bool,
    /// Extract UTF-16LE strings. Default: true.
    pub utf16le: bool,
}
18
19impl Default for ExtractConfig {
20    fn default() -> Self {
21        Self {
22            min_length: 4,
23            ascii: true,
24            utf16le: true,
25        }
26    }
27}
28
/// Reports whether `b` may appear inside an extracted ASCII string.
///
/// Accepted bytes are the visible ASCII characters (`!` through `~`), the
/// space character, and the three whitespace controls tab, line feed and
/// carriage return.
#[inline]
fn is_printable_ascii(b: u8) -> bool {
    // `is_ascii_graphic` covers 0x21..=0x7E; space and the whitespace
    // controls are added explicitly.
    b.is_ascii_graphic() || matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
35
/// Reports whether the UTF-16 code unit `cp` may appear inside an extracted
/// UTF-16LE string.
///
/// Only code units whose value lies in the printable ASCII range
/// (0x0020–0x007E) or equals tab, line feed or carriage return are accepted;
/// every other code unit ends the current run.
#[inline]
fn is_printable_utf16(cp: u16) -> bool {
    match cp {
        0x0009 | 0x000A | 0x000D => true,
        other => (0x0020..=0x007E).contains(&other),
    }
}
42
43/// Extract strings from a physical memory provider.
44///
45/// Reads 64 KB chunks from each physical range, scanning for ASCII and/or
46/// UTF-16LE printable sequences of at least `config.min_length` characters.
47/// The returned `ClassifiedString` values have empty `categories` — callers
48/// should run a classifier pass afterwards.
49pub fn extract_strings(
50    provider: &dyn PhysicalMemoryProvider,
51    config: &ExtractConfig,
52) -> Vec<ClassifiedString> {
53    let mut results: Vec<ClassifiedString> = Vec::new();
54
55    for range in provider.ranges() {
56        let mut addr = range.start;
57
58        // carry-over buffer for ASCII across chunk boundaries
59        let mut ascii_carry: Vec<u8> = Vec::new();
60        let mut ascii_carry_offset: u64 = range.start;
61
62        // carry-over for UTF-16LE (we may have an odd byte left from previous chunk)
63        let mut utf16_odd_byte: Option<(u8, u64)> = None;
64
65        while addr < range.end {
66            let chunk_len = CHUNK_SIZE.min((range.end - addr) as usize);
67            let mut buf = vec![0u8; chunk_len];
68            let n = provider.read_phys(addr, &mut buf).unwrap_or(0);
69            if n == 0 {
70                if ascii_carry.len() >= config.min_length && config.ascii {
71                    emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
72                }
73                ascii_carry.clear();
74                utf16_odd_byte = None;
75                addr += chunk_len as u64;
76                continue;
77            }
78            let chunk = &buf[..n];
79
80            // ── ASCII pass ────────────────────────────────────────────────
81            if config.ascii {
82                for (i, &b) in chunk.iter().enumerate() {
83                    let phys = addr + i as u64;
84                    if is_printable_ascii(b) {
85                        if ascii_carry.is_empty() {
86                            ascii_carry_offset = phys;
87                        }
88                        ascii_carry.push(b);
89                    } else {
90                        if ascii_carry.len() >= config.min_length {
91                            emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
92                        }
93                        ascii_carry.clear();
94                    }
95                }
96            }
97
98            // ── UTF-16LE pass ─────────────────────────────────────────────
99            if config.utf16le {
100                let (pairs, new_odd) = build_utf16_pairs(chunk, addr, utf16_odd_byte.take());
101
102                let mut run: Vec<char> = Vec::new();
103                let mut run_offset: u64 = 0;
104
105                for (cp, phys) in pairs {
106                    if is_printable_utf16(cp) {
107                        if run.is_empty() {
108                            run_offset = phys;
109                        }
110                        run.push(cp as u8 as char);
111                    } else {
112                        if run.len() >= config.min_length {
113                            emit_utf16(&run, run_offset, &mut results);
114                        }
115                        run.clear();
116                    }
117                }
118                if run.len() >= config.min_length {
119                    emit_utf16(&run, run_offset, &mut results);
120                }
121                utf16_odd_byte = new_odd;
122            }
123
124            addr += n as u64;
125        }
126
127        // ── End-of-range flushes ──────────────────────────────────────────
128        if config.ascii && ascii_carry.len() >= config.min_length {
129            emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
130        }
131    }
132
133    results
134}
135
136// ── helpers ──────────────────────────────────────────────────────────────────
137
138fn emit_ascii(run: &[u8], offset: u64, out: &mut Vec<ClassifiedString>) {
139    let value = String::from_utf8_lossy(run).into_owned();
140    out.push(ClassifiedString {
141        value,
142        physical_offset: offset,
143        encoding: StringEncoding::Ascii,
144        categories: vec![],
145    });
146}
147
148fn emit_utf16(run: &[char], offset: u64, out: &mut Vec<ClassifiedString>) {
149    let value: String = run.iter().collect();
150    out.push(ClassifiedString {
151        value,
152        physical_offset: offset,
153        encoding: StringEncoding::Utf16Le,
154        categories: vec![],
155    });
156}
157
/// Groups the bytes of `chunk` into little-endian UTF-16 code units, each
/// tagged with the physical address of its first (low) byte.
///
/// `chunk_base` is the physical address of `chunk[0]`. `odd` is a low byte
/// (and its address) left over from the previous chunk; when present it is
/// combined with `chunk[0]` to form the first code unit.
///
/// Returns the code units together with the trailing byte that could not be
/// paired, if any. When `chunk` is empty the incoming `odd` byte is handed
/// back unchanged.
fn build_utf16_pairs(
    chunk: &[u8],
    chunk_base: u64,
    odd: Option<(u8, u64)>,
) -> (Vec<(u16, u64)>, Option<(u8, u64)>) {
    let mut units: Vec<(u16, u64)> = Vec::new();

    // Finish the code unit split across the previous chunk boundary, and work
    // out how many leading bytes of this chunk that consumed.
    let skip: usize = match (odd, chunk.first()) {
        (Some(pending), None) => return (units, Some(pending)),
        (Some((lo, lo_addr)), Some(&hi)) => {
            units.push((u16::from_le_bytes([lo, hi]), lo_addr));
            1
        }
        (None, _) => 0,
    };

    let body = &chunk[skip..];
    let whole = body.chunks_exact(2);
    let tail = whole.remainder();

    for (k, pair) in whole.enumerate() {
        let at = chunk_base + (skip + 2 * k) as u64;
        units.push((u16::from_le_bytes([pair[0], pair[1]]), at));
    }

    // A non-empty remainder is exactly one byte: the last byte of the chunk.
    let leftover = tail
        .first()
        .map(|&b| (b, chunk_base + (chunk.len() - 1) as u64));

    (units, leftover)
}
194
195// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use memf_format::raw::RawProvider;

    use super::*;

    // ── helpers ──────────────────────────────────────────────────────────────

    /// Configuration that scans for ASCII strings only.
    fn cfg_ascii_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            min_length: min,
            ascii: true,
            utf16le: false,
        }
    }

    /// Configuration that scans for UTF-16LE strings only.
    fn cfg_utf16_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            min_length: min,
            ascii: false,
            utf16le: true,
        }
    }

    // ── Test 1: basic ASCII extraction ────────────────────────────────────────

    #[test]
    fn extract_ascii_basic() {
        // Build a 64-byte buffer: zeros everywhere except two embedded strings.
        //   offset 0x08: "Hello" (5 bytes)
        //   offset 0x20: "World" (5 bytes)
        let mut data = vec![0u8; 64];
        data[0x08..0x0D].copy_from_slice(b"Hello");
        data[0x20..0x25].copy_from_slice(b"World");

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_ascii_only(4);
        let strings = extract_strings(&provider, &cfg);

        assert_eq!(
            strings.len(),
            2,
            "expected exactly 2 strings, got {strings:?}"
        );

        let hello = strings
            .iter()
            .find(|s| s.value == "Hello")
            .expect("Hello not found");
        assert_eq!(hello.physical_offset, 0x08);
        assert_eq!(hello.encoding, StringEncoding::Ascii);

        let world = strings
            .iter()
            .find(|s| s.value == "World")
            .expect("World not found");
        assert_eq!(world.physical_offset, 0x20);
        assert_eq!(world.encoding, StringEncoding::Ascii);
    }

    // ── Test 2: min_length filter ─────────────────────────────────────────────

    #[test]
    fn min_length_filters_short_strings() {
        // "Hi" is 2 chars -> filtered; "Longer" is 6 chars -> kept.
        let mut data = vec![0u8; 32];
        data[0x00..0x02].copy_from_slice(b"Hi");
        data[0x10..0x16].copy_from_slice(b"Longer");

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_ascii_only(4);
        let strings = extract_strings(&provider, &cfg);

        assert!(
            strings.iter().all(|s| s.value != "Hi"),
            "\"Hi\" should be filtered out (len < min_length)"
        );
        assert!(
            strings.iter().any(|s| s.value == "Longer"),
            "\"Longer\" should be kept"
        );
    }

    // ── Test 3: UTF-16LE extraction ───────────────────────────────────────────

    #[test]
    fn extract_utf16le() {
        // Encode "Test" as UTF-16LE: T\0 e\0 s\0 t\0 = 8 bytes
        let mut data = vec![0u8; 32];
        let utf16_bytes: &[u8] = &[b'T', 0x00, b'e', 0x00, b's', 0x00, b't', 0x00];
        let offset = 0x08usize;
        data[offset..offset + utf16_bytes.len()].copy_from_slice(utf16_bytes);

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_utf16_only(4);
        let strings = extract_strings(&provider, &cfg);

        let found = strings
            .iter()
            .find(|s| s.value == "Test")
            .unwrap_or_else(|| panic!("expected UTF-16LE \"Test\", got {strings:?}"));
        assert_eq!(found.encoding, StringEncoding::Utf16Le);
        assert_eq!(found.physical_offset, offset as u64);
    }

    // ── Test 4: empty provider produces no strings ────────────────────────────

    #[test]
    fn empty_dump() {
        let provider = RawProvider::from_bytes(&[]);
        let cfg = ExtractConfig::default();
        let strings = extract_strings(&provider, &cfg);
        assert!(strings.is_empty(), "empty dump should yield no strings");
    }

    #[test]
    fn extract_config_default_values() {
        let cfg = ExtractConfig::default();
        assert_eq!(cfg.min_length, 4);
        assert!(cfg.ascii);
        assert!(cfg.utf16le);
    }

    #[test]
    fn cross_boundary_ascii_detection() {
        // Build a buffer where a string spans the 64KB chunk boundary.
        let total_size = CHUNK_SIZE + 128;
        let mut data = vec![0u8; total_size];
        // Place "ABCDEFGHIJ" (10 chars) starting 5 bytes before the boundary.
        let start = CHUNK_SIZE - 5;
        data[start..start + 10].copy_from_slice(b"ABCDEFGHIJ");

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_ascii_only(4);
        let strings = extract_strings(&provider, &cfg);

        // The carry mechanism should produce "ABCDEFGHIJ" as a single string.
        let s = strings
            .iter()
            .find(|s| s.value.contains("ABCDE"))
            .unwrap_or_else(|| {
                panic!(
                    "expected cross-boundary string, got {:?}",
                    strings.iter().map(|s| &s.value).collect::<Vec<_>>()
                )
            });
        assert_eq!(s.value, "ABCDEFGHIJ");
        assert_eq!(s.physical_offset, start as u64);
    }

    #[test]
    fn ascii_only_mode_skips_utf16() {
        // Build UTF-16LE "Test" but only enable ASCII mode.
        let mut data = vec![0u8; 32];
        data[0..8].copy_from_slice(&[b'T', 0x00, b'e', 0x00, b's', 0x00, b't', 0x00]);

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_ascii_only(4);
        let strings = extract_strings(&provider, &cfg);

        // Should NOT find "Test" as a UTF-16 string.
        assert!(
            !strings
                .iter()
                .any(|s| s.value == "Test" && s.encoding == StringEncoding::Utf16Le),
            "UTF-16 strings should not be extracted in ASCII-only mode"
        );
    }

    #[test]
    fn utf16_only_mode_skips_ascii() {
        let mut data = vec![0u8; 32];
        data[0..5].copy_from_slice(b"Hello");

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_utf16_only(4);
        let strings = extract_strings(&provider, &cfg);

        // Should NOT find "Hello" as an ASCII string.
        assert!(
            !strings
                .iter()
                .any(|s| s.value == "Hello" && s.encoding == StringEncoding::Ascii),
            "ASCII strings should not be extracted in UTF-16-only mode"
        );
    }

    /// Both ASCII and UTF-16LE strings coexist in the same dump — both must be found.
    #[test]
    fn mixed_ascii_and_utf16le_in_same_dump() {
        let mut data = vec![0u8; 128];
        // ASCII string "ASCII" at offset 0x00.
        data[0x00..0x05].copy_from_slice(b"ASCII");
        // UTF-16LE string "HI!" at offset 0x40 (two bytes per character, the
        // high byte of each being zero).
        let utf16: Vec<u8> = "HI!".encode_utf16().flat_map(u16::to_le_bytes).collect();
        data[0x40..0x40 + utf16.len()].copy_from_slice(&utf16);

        let provider = RawProvider::from_bytes(&data);
        let cfg = ExtractConfig {
            min_length: 3,
            ascii: true,
            utf16le: true,
        };
        let strings = extract_strings(&provider, &cfg);

        let ascii = strings
            .iter()
            .find(|s| s.value == "ASCII" && s.encoding == StringEncoding::Ascii)
            .expect("ASCII string must be found in mixed dump");
        assert_eq!(ascii.physical_offset, 0x00);

        let utf16 = strings
            .iter()
            .find(|s| s.value == "HI!" && s.encoding == StringEncoding::Utf16Le)
            .expect("UTF-16LE string must be found in mixed dump");
        assert_eq!(utf16.physical_offset, 0x40);
    }

    /// UTF-16LE surrogate code units (0xD800–0xDFFF) must be silently skipped,
    /// not panic or produce garbled output.
    #[test]
    fn utf16le_surrogate_pairs_are_skipped() {
        let mut data = vec![0u8; 64];
        // Embed a lone surrogate (0xD800) followed later by the valid string
        // "OKAY". Surrogate at offset 0: [0x00, 0xD8].
        data[0x00..0x02].copy_from_slice(&[0x00, 0xD8]);
        // "OKAY" as UTF-16LE at offset 0x10; the zero bytes after it end the run.
        let okay_utf16: Vec<u8> = "OKAY"
            .encode_utf16()
            .flat_map(u16::to_le_bytes)
            .collect();
        data[0x10..0x10 + okay_utf16.len()].copy_from_slice(&okay_utf16);

        let provider = RawProvider::from_bytes(&data);
        let cfg = cfg_utf16_only(4);
        let strings = extract_strings(&provider, &cfg);

        // Must not panic; all extracted strings must be valid Rust strings (guaranteed
        // by the type system — surrogates cannot be represented in `char`/`String`).
        // Verify the valid string after the surrogate is still recovered.
        let okay_found = strings.iter().any(|s| s.value == "OKAY");
        assert!(
            okay_found,
            "valid UTF-16LE string after surrogate must still be found"
        );
    }
}