use memf_format::PhysicalMemoryProvider;
use crate::{ClassifiedString, StringEncoding};
/// Upper bound, in bytes, on the size of a single read issued while scanning a
/// physical range (64 KiB).
const CHUNK_SIZE: usize = 64 * 1024;
/// Options controlling which strings are extracted from a memory image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractConfig {
    /// Minimum number of characters a printable run must contain to be reported.
    pub min_length: usize,
    /// Whether to scan for single-byte ASCII strings.
    pub ascii: bool,
    /// Whether to scan for UTF-16LE strings.
    pub utf16le: bool,
}
impl Default for ExtractConfig {
fn default() -> Self {
Self {
min_length: 4,
ascii: true,
utf16le: true,
}
}
}
/// Returns `true` when `b` may be part of an extracted ASCII string: any byte
/// from space (0x20) through `~` (0x7E), or a tab, line feed or carriage return.
#[inline]
fn is_printable_ascii(b: u8) -> bool {
    // `is_ascii_graphic` covers 0x21..=0x7E; space and the three accepted
    // control characters are listed explicitly.
    b.is_ascii_graphic() || matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
/// Returns `true` when the UTF-16 code unit `cp` may be part of an extracted
/// string: U+0020 through U+007E, or a tab, line feed or carriage return.
#[inline]
fn is_printable_utf16(cp: u16) -> bool {
    const ACCEPTED_CONTROLS: [u16; 3] = [0x0009, 0x000A, 0x000D];
    (0x0020..=0x007E).contains(&cp) || ACCEPTED_CONTROLS.contains(&cp)
}
/// Scans every range reported by `provider` and returns the printable strings
/// found in it.
///
/// Each range is read in pieces of at most [`CHUNK_SIZE`] bytes. Depending on
/// `config`, two independent scans run over the same bytes:
///
/// * **ASCII** — consecutive bytes accepted by `is_printable_ascii`.
/// * **UTF-16LE** — consecutive little-endian code units accepted by
///   `is_printable_utf16`. Code units are paired starting at the first byte of
///   the range; a trailing odd byte of one chunk is joined with the first byte
///   of the next.
///
/// Runs of both kinds continue across chunk boundaries, so a string is reported
/// once, with the physical offset of its first byte, no matter where the
/// boundary falls. A run is reported only if it has at least
/// `config.min_length` characters; a `min_length` of zero is treated as one so
/// that empty strings are never produced.
///
/// A chunk that cannot be read (the provider returns an error or zero bytes) is
/// skipped: runs in progress are flushed and scanning resumes after the chunk.
/// Runs never continue from one range into the next.
pub fn extract_strings(
    provider: &dyn PhysicalMemoryProvider,
    config: &ExtractConfig,
) -> Vec<ClassifiedString> {
    // With a threshold of zero every non-printable byte would "complete" an
    // empty run, flooding the output with empty strings.
    let min_len = config.min_length.max(1);
    let mut results: Vec<ClassifiedString> = Vec::new();
    // One scratch buffer shared by all reads; only the freshly read prefix of
    // it is ever inspected.
    let mut buf = vec![0u8; CHUNK_SIZE];

    for range in provider.ranges() {
        let mut addr = range.start;
        let mut ascii_run: Vec<u8> = Vec::new();
        let mut ascii_run_offset: u64 = range.start;
        let mut utf16_run: Vec<char> = Vec::new();
        let mut utf16_run_offset: u64 = range.start;
        let mut utf16_odd_byte: Option<(u8, u64)> = None;

        while addr < range.end {
            // Clamp in u64 before narrowing: the result is at most CHUNK_SIZE,
            // so the cast cannot truncate, and it is at least 1 because
            // `addr < range.end`.
            let chunk_len = (range.end - addr).min(CHUNK_SIZE as u64) as usize;
            // Never trust the provider to report more bytes than were offered.
            let n = provider
                .read_phys(addr, &mut buf[..chunk_len])
                .unwrap_or(0)
                .min(chunk_len);

            if n == 0 {
                // Unreadable chunk: nothing can continue across the gap.
                if config.ascii && ascii_run.len() >= min_len {
                    emit_ascii(&ascii_run, ascii_run_offset, &mut results);
                }
                ascii_run.clear();
                if config.utf16le && utf16_run.len() >= min_len {
                    emit_utf16(&utf16_run, utf16_run_offset, &mut results);
                }
                utf16_run.clear();
                utf16_odd_byte = None;
                addr += chunk_len as u64;
                continue;
            }

            let chunk = &buf[..n];

            if config.ascii {
                for (i, &b) in chunk.iter().enumerate() {
                    if is_printable_ascii(b) {
                        if ascii_run.is_empty() {
                            ascii_run_offset = addr + i as u64;
                        }
                        ascii_run.push(b);
                    } else {
                        if ascii_run.len() >= min_len {
                            emit_ascii(&ascii_run, ascii_run_offset, &mut results);
                        }
                        ascii_run.clear();
                    }
                }
            }

            if config.utf16le {
                let (pairs, new_odd) = build_utf16_pairs(chunk, addr, utf16_odd_byte.take());
                for (cp, phys) in pairs {
                    if is_printable_utf16(cp) {
                        if utf16_run.is_empty() {
                            utf16_run_offset = phys;
                        }
                        // Accepted code units are at most 0x7E, so narrowing
                        // to `u8` is lossless.
                        utf16_run.push(cp as u8 as char);
                    } else {
                        if utf16_run.len() >= min_len {
                            emit_utf16(&utf16_run, utf16_run_offset, &mut results);
                        }
                        utf16_run.clear();
                    }
                }
                // The run is deliberately NOT flushed here: it may continue in
                // the next chunk.
                utf16_odd_byte = new_odd;
            }

            addr += n as u64;
        }

        // End of range: whatever is still pending is complete.
        if config.ascii && ascii_run.len() >= min_len {
            emit_ascii(&ascii_run, ascii_run_offset, &mut results);
        }
        if config.utf16le && utf16_run.len() >= min_len {
            emit_utf16(&utf16_run, utf16_run_offset, &mut results);
        }
    }

    results
}
/// Appends the byte run `run`, which starts at physical address `offset`, to
/// `out` as an ASCII-encoded string with no categories assigned.
///
/// Bytes that are not valid UTF-8 are replaced with U+FFFD by the lossy
/// conversion.
fn emit_ascii(run: &[u8], offset: u64, out: &mut Vec<ClassifiedString>) {
    let record = ClassifiedString {
        value: String::from_utf8_lossy(run).to_string(),
        physical_offset: offset,
        encoding: StringEncoding::Ascii,
        categories: Vec::new(),
    };
    out.push(record);
}
/// Appends the character run `run`, whose first code unit lies at physical
/// address `offset`, to `out` as a UTF-16LE-encoded string with no categories
/// assigned.
fn emit_utf16(run: &[char], offset: u64, out: &mut Vec<ClassifiedString>) {
    let mut text = String::with_capacity(run.len());
    text.extend(run);
    let record = ClassifiedString {
        value: text,
        physical_offset: offset,
        encoding: StringEncoding::Utf16Le,
        categories: Vec::new(),
    };
    out.push(record);
}
/// Splits `chunk` into little-endian 16-bit code units, each tagged with the
/// physical address of its low byte.
///
/// * `chunk` — the bytes to split.
/// * `chunk_base` — physical address of `chunk[0]`.
/// * `odd` — a low byte (and its address) left over from the previous chunk; it
///   is combined with `chunk[0]` to form the first code unit.
///
/// Returns the code units together with the unpaired trailing byte of this
/// chunk, if any. When `chunk` is empty, `odd` is handed back unchanged.
fn build_utf16_pairs(
    chunk: &[u8],
    chunk_base: u64,
    odd: Option<(u8, u64)>,
) -> (Vec<(u16, u64)>, Option<(u8, u64)>) {
    let mut units: Vec<(u16, u64)> = Vec::with_capacity((chunk.len() + 1) / 2);

    // Number of leading bytes of `chunk` consumed by the carried-over low byte.
    let skip = match odd {
        Some(pending) if chunk.is_empty() => return (units, Some(pending)),
        Some((lo, lo_addr)) => {
            units.push((u16::from_le_bytes([lo, chunk[0]]), lo_addr));
            1usize
        }
        None => 0usize,
    };

    let aligned = chunk[skip..].chunks_exact(2);
    let tail = aligned.remainder();
    units.extend(aligned.enumerate().map(|(k, pair)| {
        (
            u16::from_le_bytes([pair[0], pair[1]]),
            chunk_base + (skip + 2 * k) as u64,
        )
    }));

    // At most one byte can be left over, and it is always the last byte.
    let leftover = tail
        .first()
        .map(|&b| (b, chunk_base + (chunk.len() - 1) as u64));
    (units, leftover)
}
#[cfg(test)]
mod tests {
    use memf_format::raw::RawProvider;

    use super::*;

    /// The text "Test" encoded as UTF-16LE.
    const TEST_UTF16LE: [u8; 8] = [b'T', 0x00, b'e', 0x00, b's', 0x00, b't', 0x00];

    /// Configuration that scans for ASCII strings only.
    fn cfg_ascii_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            utf16le: false,
            ascii: true,
            min_length: min,
        }
    }

    /// Configuration that scans for UTF-16LE strings only.
    fn cfg_utf16_only(min: usize) -> ExtractConfig {
        ExtractConfig {
            utf16le: true,
            ascii: false,
            min_length: min,
        }
    }

    /// Wraps `image` in a raw provider and runs the extractor over it.
    fn scan(image: &[u8], cfg: &ExtractConfig) -> Vec<ClassifiedString> {
        let provider = RawProvider::from_bytes(image);
        extract_strings(&provider, cfg)
    }

    /// Encodes `text` as UTF-16LE bytes.
    fn utf16le_bytes(text: &str) -> Vec<u8> {
        text.encode_utf16().flat_map(u16::to_le_bytes).collect()
    }

    #[test]
    fn extract_ascii_basic() {
        let mut image = vec![0u8; 64];
        image[0x08..0x0D].copy_from_slice(b"Hello");
        image[0x20..0x25].copy_from_slice(b"World");

        let strings = scan(&image, &cfg_ascii_only(4));
        assert_eq!(
            strings.len(),
            2,
            "expected exactly 2 strings, got {strings:?}"
        );

        let expectations = [
            ("Hello", 0x08u64, "Hello not found"),
            ("World", 0x20u64, "World not found"),
        ];
        for (text, offset, missing) in expectations {
            let hit = strings.iter().find(|s| s.value == text).expect(missing);
            assert_eq!(hit.physical_offset, offset);
            assert_eq!(hit.encoding, StringEncoding::Ascii);
        }
    }

    #[test]
    fn min_length_filters_short_strings() {
        let mut image = vec![0u8; 32];
        image[0x00..0x02].copy_from_slice(b"Hi");
        image[0x10..0x16].copy_from_slice(b"Longer");

        let strings = scan(&image, &cfg_ascii_only(4));

        let short_present = strings.iter().any(|s| s.value == "Hi");
        assert!(
            !short_present,
            "\"Hi\" should be filtered out (len < min_length)"
        );
        let long_present = strings.iter().any(|s| s.value == "Longer");
        assert!(long_present, "\"Longer\" should be kept");
    }

    #[test]
    fn extract_utf16le() {
        let offset = 0x08usize;
        let mut image = vec![0u8; 32];
        image[offset..offset + TEST_UTF16LE.len()].copy_from_slice(&TEST_UTF16LE);

        let strings = scan(&image, &cfg_utf16_only(4));

        let found = strings.iter().find(|s| s.value == "Test");
        assert!(
            found.is_some(),
            "expected UTF-16LE \"Test\", got {strings:?}"
        );
        let found = found.unwrap();
        assert_eq!(found.encoding, StringEncoding::Utf16Le);
        assert_eq!(found.physical_offset, offset as u64);
    }

    #[test]
    fn empty_dump() {
        let strings = scan(&[], &ExtractConfig::default());
        assert!(strings.is_empty(), "empty dump should yield no strings");
    }

    #[test]
    fn extract_config_default_values() {
        let ExtractConfig {
            min_length,
            ascii,
            utf16le,
        } = ExtractConfig::default();
        assert_eq!(min_length, 4);
        assert!(ascii);
        assert!(utf16le);
    }

    #[test]
    fn cross_boundary_ascii_detection() {
        // The string starts five bytes before the 64 KiB mark and ends five
        // bytes after it.
        let boundary = 65536;
        let start = boundary - 5;
        let mut image = vec![0u8; boundary + 128];
        image[start..start + 10].copy_from_slice(b"ABCDEFGHIJ");

        let strings = scan(&image, &cfg_ascii_only(4));

        let found = strings.iter().find(|s| s.value.contains("ABCDE"));
        assert!(
            found.is_some(),
            "expected cross-boundary string, got {:?}",
            strings.iter().map(|s| &s.value).collect::<Vec<_>>()
        );
        let hit = found.unwrap();
        assert_eq!(hit.value, "ABCDEFGHIJ");
        assert_eq!(hit.physical_offset, start as u64);
    }

    #[test]
    fn ascii_only_mode_skips_utf16() {
        let mut image = vec![0u8; 32];
        image[0..8].copy_from_slice(&TEST_UTF16LE);

        let strings = scan(&image, &cfg_ascii_only(4));

        let leaked = strings
            .iter()
            .any(|s| s.value == "Test" && s.encoding == StringEncoding::Utf16Le);
        assert!(
            !leaked,
            "UTF-16 strings should not be extracted in ASCII-only mode"
        );
    }

    #[test]
    fn utf16_only_mode_skips_ascii() {
        let mut image = vec![0u8; 32];
        image[0..5].copy_from_slice(b"Hello");

        let strings = scan(&image, &cfg_utf16_only(4));

        let leaked = strings
            .iter()
            .any(|s| s.value == "Hello" && s.encoding == StringEncoding::Ascii);
        assert!(
            !leaked,
            "ASCII strings should not be extracted in UTF-16-only mode"
        );
    }

    #[test]
    fn mixed_ascii_and_utf16le_in_same_dump() {
        let wide = utf16le_bytes("HI!");
        let mut image = vec![0u8; 128];
        image[0x00..0x05].copy_from_slice(b"ASCII");
        image[0x40..0x40 + wide.len()].copy_from_slice(&wide);

        let cfg = ExtractConfig {
            min_length: 3,
            ascii: true,
            utf16le: true,
        };
        let strings = scan(&image, &cfg);

        let has = |text: &str, encoding: StringEncoding| {
            strings
                .iter()
                .any(|s| s.value == text && s.encoding == encoding)
        };
        let ascii_found = has("ASCII", StringEncoding::Ascii);
        let utf16_found = has("HI!", StringEncoding::Utf16Le);
        assert!(ascii_found, "ASCII string must be found in mixed dump");
        assert!(utf16_found, "UTF-16LE string must be found in mixed dump");
    }

    #[test]
    fn utf16le_surrogate_pairs_are_skipped() {
        let wide = utf16le_bytes("OKAY");
        let mut image = vec![0u8; 64];
        // A lone high surrogate (U+D800) in little-endian byte order.
        image[0x00..0x02].copy_from_slice(&[0x00, 0xD8]);
        image[0x10..0x10 + wide.len()].copy_from_slice(&wide);

        let strings = scan(&image, &cfg_utf16_only(4));

        let ok_found = strings.iter().any(|s| s.value == "OKAY");
        assert!(
            ok_found,
            "valid UTF-16LE string after surrogate must still be found"
        );
    }
}