1use memf_format::PhysicalMemoryProvider;
4
5use crate::{ClassifiedString, StringEncoding};
6
/// Number of bytes requested from the memory provider per read (64 KiB).
const CHUNK_SIZE: usize = 64 * 1024;

/// Options selecting which string encodings are scanned for and how long a
/// run of printable characters must be before it is reported.
#[derive(Debug, Clone)]
pub struct ExtractConfig {
    /// Minimum number of characters a run must contain to be reported.
    pub min_length: usize,
    /// Scan for runs of printable single-byte ASCII characters.
    pub ascii: bool,
    /// Scan for runs of printable characters stored as UTF-16LE code units.
    pub utf16le: bool,
}
18
19impl Default for ExtractConfig {
20 fn default() -> Self {
21 Self {
22 min_length: 4,
23 ascii: true,
24 utf16le: true,
25 }
26 }
27}
28
/// Reports whether `b` may be part of an extracted ASCII string: any byte
/// from space (0x20) through tilde (0x7E), or tab, line feed, carriage return.
#[inline]
fn is_printable_ascii(b: u8) -> bool {
    // `is_ascii_graphic` covers 0x21..=0x7E; space and the three accepted
    // control characters are added explicitly.
    b == b' ' || b.is_ascii_graphic() || matches!(b, b'\t' | b'\n' | b'\r')
}
35
/// Reports whether the UTF-16 code unit `cp` may be part of an extracted
/// string. Only the ASCII-range code units are accepted: U+0020 through
/// U+007E, plus tab (U+0009), line feed (U+000A) and carriage return (U+000D).
#[inline]
fn is_printable_utf16(cp: u16) -> bool {
    let is_visible = (0x0020..=0x007E).contains(&cp);
    let is_accepted_control = cp == 0x0009 || cp == 0x000A || cp == 0x000D;
    is_visible || is_accepted_control
}
42
43pub fn extract_strings(
50 provider: &dyn PhysicalMemoryProvider,
51 config: &ExtractConfig,
52) -> Vec<ClassifiedString> {
53 let mut results: Vec<ClassifiedString> = Vec::new();
54
55 for range in provider.ranges() {
56 let mut addr = range.start;
57
58 let mut ascii_carry: Vec<u8> = Vec::new();
60 let mut ascii_carry_offset: u64 = range.start;
61
62 let mut utf16_odd_byte: Option<(u8, u64)> = None;
64
65 while addr < range.end {
66 let chunk_len = CHUNK_SIZE.min((range.end - addr) as usize);
67 let mut buf = vec![0u8; chunk_len];
68 let n = provider.read_phys(addr, &mut buf).unwrap_or(0);
69 if n == 0 {
70 if ascii_carry.len() >= config.min_length && config.ascii {
71 emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
72 }
73 ascii_carry.clear();
74 utf16_odd_byte = None;
75 addr += chunk_len as u64;
76 continue;
77 }
78 let chunk = &buf[..n];
79
80 if config.ascii {
82 for (i, &b) in chunk.iter().enumerate() {
83 let phys = addr + i as u64;
84 if is_printable_ascii(b) {
85 if ascii_carry.is_empty() {
86 ascii_carry_offset = phys;
87 }
88 ascii_carry.push(b);
89 } else {
90 if ascii_carry.len() >= config.min_length {
91 emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
92 }
93 ascii_carry.clear();
94 }
95 }
96 }
97
98 if config.utf16le {
100 let (pairs, new_odd) = build_utf16_pairs(chunk, addr, utf16_odd_byte.take());
101
102 let mut run: Vec<char> = Vec::new();
103 let mut run_offset: u64 = 0;
104
105 for (cp, phys) in pairs {
106 if is_printable_utf16(cp) {
107 if run.is_empty() {
108 run_offset = phys;
109 }
110 run.push(cp as u8 as char);
111 } else {
112 if run.len() >= config.min_length {
113 emit_utf16(&run, run_offset, &mut results);
114 }
115 run.clear();
116 }
117 }
118 if run.len() >= config.min_length {
119 emit_utf16(&run, run_offset, &mut results);
120 }
121 utf16_odd_byte = new_odd;
122 }
123
124 addr += n as u64;
125 }
126
127 if config.ascii && ascii_carry.len() >= config.min_length {
129 emit_ascii(&ascii_carry, ascii_carry_offset, &mut results);
130 }
131 }
132
133 results
134}
135
136fn emit_ascii(run: &[u8], offset: u64, out: &mut Vec<ClassifiedString>) {
139 let value = String::from_utf8_lossy(run).into_owned();
140 out.push(ClassifiedString {
141 value,
142 physical_offset: offset,
143 encoding: StringEncoding::Ascii,
144 categories: vec![],
145 });
146}
147
148fn emit_utf16(run: &[char], offset: u64, out: &mut Vec<ClassifiedString>) {
149 let value: String = run.iter().collect();
150 out.push(ClassifiedString {
151 value,
152 physical_offset: offset,
153 encoding: StringEncoding::Utf16Le,
154 categories: vec![],
155 });
156}
157
/// Splits `chunk` into little-endian 16-bit code units, each tagged with the
/// address of its low byte.
///
/// * `chunk` - bytes to decode.
/// * `chunk_base` - address of `chunk[0]`.
/// * `odd` - a low byte (and its address) left over from the previous chunk;
///   when present it is combined with `chunk[0]` to form the first code unit.
///
/// Returns the decoded `(code_unit, address)` pairs together with the trailing
/// byte, if any, that still lacks its high byte. If `chunk` is empty the
/// incoming `odd` byte is handed back unchanged.
fn build_utf16_pairs(
    chunk: &[u8],
    chunk_base: u64,
    odd: Option<(u8, u64)>,
) -> (Vec<(u16, u64)>, Option<(u8, u64)>) {
    let mut pairs = Vec::new();

    // Number of leading bytes of `chunk` consumed by completing the carry.
    let consumed = match odd {
        Some(carry) if chunk.is_empty() => return (pairs, Some(carry)),
        Some((lo, lo_addr)) => {
            pairs.push((u16::from_le_bytes([lo, chunk[0]]), lo_addr));
            1usize
        }
        None => 0usize,
    };

    let units = chunk[consumed..].chunks_exact(2);
    let leftover = units.remainder();
    for (k, unit) in units.enumerate() {
        let unit_addr = chunk_base + (consumed + 2 * k) as u64;
        pairs.push((u16::from_le_bytes([unit[0], unit[1]]), unit_addr));
    }

    // At most one byte can remain, and it is always the last byte of `chunk`.
    let new_odd = leftover
        .first()
        .map(|&lo| (lo, chunk_base + (chunk.len() - 1) as u64));

    (pairs, new_odd)
}
194
#[cfg(test)]
mod tests {
    use memf_format::raw::RawProvider;

    use super::*;

    /// Configuration that scans for ASCII strings only.
    fn cfg_ascii_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            utf16le: false,
            ascii: true,
            min_length: min,
        }
    }

    /// Configuration that scans for UTF-16LE strings only.
    fn cfg_utf16_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            utf16le: true,
            ascii: false,
            min_length: min,
        }
    }

    /// Runs the extractor over an in-memory dump.
    fn scan(dump: &[u8], cfg: &ExtractConfig) -> Vec<ClassifiedString> {
        let provider = RawProvider::from_bytes(dump);
        extract_strings(&provider, cfg)
    }

    /// Copies `bytes` into `dump` starting at index `at`.
    fn put(dump: &mut [u8], at: usize, bytes: &[u8]) {
        dump[at..at + bytes.len()].copy_from_slice(bytes);
    }

    /// Encodes `text` as UTF-16LE bytes.
    fn utf16le_bytes(text: &str) -> Vec<u8> {
        text.encode_utf16().flat_map(u16::to_le_bytes).collect()
    }

    /// Two ASCII strings separated by zero bytes are both reported with the
    /// right offsets.
    #[test]
    fn extract_ascii_basic() {
        let mut data = vec![0u8; 64];
        put(&mut data, 0x08, b"Hello");
        put(&mut data, 0x20, b"World");

        let strings = scan(&data, &cfg_ascii_only(4));

        assert_eq!(
            strings.len(),
            2,
            "expected exactly 2 strings, got {strings:?}"
        );

        let hello = strings
            .iter()
            .find(|s| s.value == "Hello")
            .expect("Hello not found");
        assert_eq!(hello.physical_offset, 0x08);
        assert_eq!(hello.encoding, StringEncoding::Ascii);

        let world = strings
            .iter()
            .find(|s| s.value == "World")
            .expect("World not found");
        assert_eq!(world.physical_offset, 0x20);
        assert_eq!(world.encoding, StringEncoding::Ascii);
    }

    /// Runs shorter than `min_length` are dropped; longer ones are kept.
    #[test]
    fn min_length_filters_short_strings() {
        let mut data = vec![0u8; 32];
        put(&mut data, 0x00, b"Hi");
        put(&mut data, 0x10, b"Longer");

        let strings = scan(&data, &cfg_ascii_only(4));

        let has_short = strings.iter().any(|s| s.value == "Hi");
        assert!(
            !has_short,
            "\"Hi\" should be filtered out (len < min_length)"
        );
        let has_long = strings.iter().any(|s| s.value == "Longer");
        assert!(has_long, "\"Longer\" should be kept");
    }

    /// A UTF-16LE string is reported with its encoding and offset.
    #[test]
    fn extract_utf16le() {
        let offset = 0x08usize;
        let mut data = vec![0u8; 32];
        put(
            &mut data,
            offset,
            &[b'T', 0x00, b'e', 0x00, b's', 0x00, b't', 0x00],
        );

        let strings = scan(&data, &cfg_utf16_only(4));

        let found = strings.iter().find(|s| s.value == "Test");
        assert!(
            found.is_some(),
            "expected UTF-16LE \"Test\", got {strings:?}"
        );
        let hit = found.unwrap();
        assert_eq!(hit.encoding, StringEncoding::Utf16Le);
        assert_eq!(hit.physical_offset, offset as u64);
    }

    /// An empty dump produces no results.
    #[test]
    fn empty_dump() {
        let strings = scan(&[], &ExtractConfig::default());
        assert!(strings.is_empty(), "empty dump should yield no strings");
    }

    /// The default configuration enables both scanners with a minimum of 4.
    #[test]
    fn extract_config_default_values() {
        let ExtractConfig {
            min_length,
            ascii,
            utf16le,
        } = ExtractConfig::default();
        assert_eq!(min_length, 4);
        assert!(ascii);
        assert!(utf16le);
    }

    /// An ASCII string straddling the 64 KiB chunk boundary is reported whole.
    #[test]
    fn cross_boundary_ascii_detection() {
        let start = 65536 - 5;
        let mut data = vec![0u8; 65536 + 128];
        put(&mut data, start, b"ABCDEFGHIJ");

        let strings = scan(&data, &cfg_ascii_only(4));

        let found = strings.iter().find(|s| s.value.contains("ABCDE"));
        assert!(
            found.is_some(),
            "expected cross-boundary string, got {:?}",
            strings.iter().map(|s| &s.value).collect::<Vec<_>>()
        );
        let s = found.unwrap();
        assert_eq!(s.value, "ABCDEFGHIJ");
        assert_eq!(s.physical_offset, start as u64);
    }

    /// With only the ASCII scanner enabled, no UTF-16LE result appears.
    #[test]
    fn ascii_only_mode_skips_utf16() {
        let mut data = vec![0u8; 32];
        put(
            &mut data,
            0,
            &[b'T', 0x00, b'e', 0x00, b's', 0x00, b't', 0x00],
        );

        let strings = scan(&data, &cfg_ascii_only(4));

        let leaked = strings
            .iter()
            .any(|s| s.value == "Test" && s.encoding == StringEncoding::Utf16Le);
        assert!(
            !leaked,
            "UTF-16 strings should not be extracted in ASCII-only mode"
        );
    }

    /// With only the UTF-16LE scanner enabled, no ASCII result appears.
    #[test]
    fn utf16_only_mode_skips_ascii() {
        let mut data = vec![0u8; 32];
        put(&mut data, 0, b"Hello");

        let strings = scan(&data, &cfg_utf16_only(4));

        let leaked = strings
            .iter()
            .any(|s| s.value == "Hello" && s.encoding == StringEncoding::Ascii);
        assert!(
            !leaked,
            "ASCII strings should not be extracted in UTF-16-only mode"
        );
    }

    /// Both encodings are found when they occur in the same dump.
    #[test]
    fn mixed_ascii_and_utf16le_in_same_dump() {
        let mut data = vec![0u8; 128];
        put(&mut data, 0x00, b"ASCII");
        put(&mut data, 0x40, &utf16le_bytes("HI!"));

        let cfg = ExtractConfig {
            min_length: 3,
            ascii: true,
            utf16le: true,
        };
        let strings = scan(&data, &cfg);

        let has = |text: &str, enc: StringEncoding| {
            strings
                .iter()
                .any(|s| s.value == text && s.encoding == enc)
        };
        let ascii_found = has("ASCII", StringEncoding::Ascii);
        let utf16_found = has("HI!", StringEncoding::Utf16Le);
        assert!(ascii_found, "ASCII string must be found in mixed dump");
        assert!(utf16_found, "UTF-16LE string must be found in mixed dump");
    }

    /// A lone surrogate code unit earlier in the dump does not prevent a
    /// later valid UTF-16LE string from being found.
    #[test]
    fn utf16le_surrogate_pairs_are_skipped() {
        let mut data = vec![0u8; 64];
        // 0xD800 in little-endian byte order: a high surrogate.
        put(&mut data, 0x00, &[0x00, 0xD8]);
        put(&mut data, 0x10, &utf16le_bytes("OKAY"));

        let strings = scan(&data, &cfg_utf16_only(4));

        let ok_found = strings.iter().any(|s| s.value == "OKAY");
        assert!(
            ok_found,
            "valid UTF-16LE string after surrogate must still be found"
        );
    }
}