use std::collections::HashSet;
use lo_core::{LoError, Result};
/// Eight-byte magic that `CompoundFile::open` requires at the very start of the input.
const CFB_SIGNATURE: &[u8; 8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";
/// Sector id meaning "unused": skipped when collecting DIFAT entries and treated as a
/// terminator when walking DIFAT, FAT and mini FAT chains.
const FREE_SECTOR: u32 = 0xFFFF_FFFF;
/// Sector id that marks the end of a sector chain.
const END_OF_CHAIN: u32 = 0xFFFF_FFFE;
/// Reserved sector id; `read_chain` refuses to follow a chain that points at it.
const FAT_SECTOR: u32 = 0xFFFF_FFFD;
/// Reserved sector id; `read_chain` refuses to follow a chain that points at it.
const DIFAT_SECTOR: u32 = 0xFFFF_FFFC;
/// One directory entry kept from the compound file's directory stream.
///
/// Entries whose object-type byte is zero are never stored (see `CompoundFile::open`).
#[derive(Clone, Debug)]
struct DirectoryEntry {
/// Entry name decoded from the UTF-16LE name field, with trailing NULs removed.
name: String,
/// Raw object-type byte read at offset 0x42 of the entry; this file looks for
/// 5 when locating the root entry and 2 when locating a stream by name.
object_type: u8,
/// First sector of the entry's data, read at offset 0x74 of the entry.
starting_sector: u32,
/// Data size in bytes, read at offset 0x78 (as 32 bits when the header's
/// major version is 3, otherwise as 64 bits).
stream_size: usize,
}
/// In-memory view of a parsed compound file, built by `CompoundFile::open`.
#[derive(Clone, Debug)]
struct CompoundFile {
/// Private copy of the whole input, used later to read regular sectors.
bytes: Vec<u8>,
/// Regular sector size in bytes (`1 << sector shift` from the header).
sector_size: usize,
/// Mini sector size in bytes (`1 << mini sector shift` from the header).
mini_sector_size: usize,
/// Streams strictly smaller than this many bytes are read from the mini stream.
mini_stream_cutoff: usize,
/// All FAT entries, concatenated in the order the FAT sectors were listed.
fat: Vec<u32>,
/// All mini FAT entries; empty when the header reports no mini FAT sectors.
mini_fat: Vec<u32>,
/// Directory entries with a non-zero object type, in on-disk order.
directory: Vec<DirectoryEntry>,
/// Contents of the root entry's stream, which backs every mini sector.
mini_stream: Vec<u8>,
}
/// One run of document text described by the piece table (see `parse_plcpcd`).
#[derive(Clone, Debug)]
struct Piece {
/// Character position where the piece starts (inclusive).
cp_start: usize,
/// Character position where the piece ends (exclusive); pieces with
/// `cp_end <= cp_start` are skipped by `extract_piece_text`.
cp_end: usize,
/// File offset field with its two top bits masked off. For compressed pieces
/// the text starts at `fc / 2` in the WordDocument stream, otherwise at `fc`.
fc: u32,
/// True when bit 30 of the raw offset field was set: the text is stored as one
/// byte per character instead of UTF-16LE.
compressed: bool,
}
/// Extracts the plain text of a legacy binary Word document held in `bytes`.
///
/// Opens the compound file, reads the `WordDocument` stream and the table
/// stream selected by its flag word, locates and parses the piece table, then
/// decodes every piece and normalizes the resulting text.
///
/// # Errors
/// Returns `LoError::Unsupported` for documents flagged as encrypted, and
/// propagates the container / parse errors of the helpers it calls.
pub fn extract_text_from_doc(bytes: &[u8]) -> Result<String> {
    // Bits of the 16-bit flag word read at offset 0x0A of the WordDocument stream.
    const ENCRYPTED_FLAG: u16 = 1 << 8;
    const USE_1TABLE_FLAG: u16 = 1 << 9;

    let compound = CompoundFile::open(bytes)?;
    let word_document = compound.read_stream("WordDocument")?;
    if word_document.len() < 0x20 {
        return Err(LoError::Parse(
            "DOC WordDocument stream is too small".to_string(),
        ));
    }

    let fib_flags = read_u16(&word_document, 0x0A)?;
    if fib_flags & ENCRYPTED_FLAG != 0 {
        return Err(LoError::Unsupported(
            "encrypted legacy DOC files are not supported by the pure-Rust parser".to_string(),
        ));
    }

    // The flag word decides which of the two table streams is the live one.
    let table_name = match fib_flags & USE_1TABLE_FLAG {
        0 => "0Table",
        _ => "1Table",
    };
    let table = compound.read_stream(table_name)?;

    let (fc_clx, lcb_clx) = locate_clx(&word_document, &table)?;
    let clx_end = fc_clx
        .checked_add(lcb_clx)
        .ok_or_else(|| LoError::Parse("DOC CLX overflow".to_string()))?;
    let clx = table.get(fc_clx..clx_end).ok_or_else(|| {
        LoError::Parse("DOC CLX extends past the table stream".to_string())
    })?;

    let pieces = parse_clx(clx)?;
    extract_piece_text(&word_document, &pieces).map(|raw_text| normalize_doc_text(&raw_text))
}
impl CompoundFile {
    /// Parses a Compound File Binary container held entirely in `bytes`.
    ///
    /// Reads the 512-byte header, gathers the list of FAT sectors from the
    /// header DIFAT and any chained DIFAT sectors, loads the FAT, the
    /// directory, the mini FAT and the root entry's mini stream, and keeps a
    /// private copy of `bytes` for later stream reads.
    ///
    /// # Errors
    /// Returns `LoError::Parse` for truncated or malformed structures: short
    /// input, wrong signature, a sector shift below 9 or too large to
    /// represent, a looping DIFAT or FAT chain, a FAT list shorter than the
    /// header claims, or sectors that lie past the end of the input. Returns
    /// `LoError::InvalidInput` when the directory has no root entry.
    fn open(bytes: &[u8]) -> Result<Self> {
        // Sector offsets are computed as `(id + 1) * sector_size`, i.e. the
        // header is assumed to fill the first sector. A sector smaller than
        // the 512-byte header cannot be laid out that way, and shifts below 2
        // would underflow the `sector_size / 4 - 1` / `sector_size - 4`
        // arithmetic used for DIFAT sectors below.
        const MIN_SECTOR_SHIFT: u16 = 9;

        if bytes.len() < 512 {
            return Err(LoError::Parse(
                "CFB file is smaller than its 512-byte header".to_string(),
            ));
        }
        if &bytes[..8] != CFB_SIGNATURE {
            return Err(LoError::Parse(
                "not a Compound File Binary document".to_string(),
            ));
        }

        let major_version = read_u16(bytes, 0x1A)?;
        let sector_shift = read_u16(bytes, 0x1E)?;
        let mini_sector_shift = read_u16(bytes, 0x20)?;
        if sector_shift < MIN_SECTOR_SHIFT {
            return Err(LoError::Parse("invalid CFB sector shift".to_string()));
        }
        let sector_size = 1usize
            .checked_shl(u32::from(sector_shift))
            .ok_or_else(|| LoError::Parse("invalid CFB sector shift".to_string()))?;
        let mini_sector_size = 1usize
            .checked_shl(u32::from(mini_sector_shift))
            .ok_or_else(|| LoError::Parse("invalid CFB mini sector shift".to_string()))?;

        let num_fat_sectors = read_u32(bytes, 0x2C)? as usize;
        let first_dir_sector = read_u32(bytes, 0x30)?;
        let mini_stream_cutoff = read_u32(bytes, 0x38)? as usize;
        let first_mini_fat_sector = read_u32(bytes, 0x3C)?;
        let num_mini_fat_sectors = read_u32(bytes, 0x40)? as usize;
        let first_difat_sector = read_u32(bytes, 0x44)?;
        let num_difat_sectors = read_u32(bytes, 0x48)? as usize;

        // The header carries the first 109 DIFAT entries inline at 0x4C.
        let mut difat = Vec::new();
        for index in 0..109usize {
            let sector = read_u32(bytes, 0x4C + index * 4)?;
            if sector != FREE_SECTOR {
                difat.push(sector);
            }
        }

        // Further entries live in a chain of DIFAT sectors; the last u32 of
        // each sector links to the next one. `sector_size >= 512` here, so the
        // subtractions cannot underflow.
        let entries_per_difat_sector = sector_size / 4 - 1;
        let mut next_difat = first_difat_sector;
        let mut seen_difat = HashSet::new();
        for _ in 0..num_difat_sectors {
            if next_difat == END_OF_CHAIN || next_difat == FREE_SECTOR {
                break;
            }
            // `num_difat_sectors` comes straight from the file; without this
            // check a self-referencing chain would keep appending entries for
            // up to 2^32 iterations.
            if !seen_difat.insert(next_difat) {
                return Err(LoError::Parse("CFB DIFAT cycle detected".to_string()));
            }
            let sector = read_sector(bytes, sector_size, next_difat)?;
            for index in 0..entries_per_difat_sector {
                let sector_id = read_u32(sector, index * 4)?;
                if sector_id != FREE_SECTOR {
                    difat.push(sector_id);
                }
            }
            next_difat = read_u32(sector, sector_size - 4)?;
        }

        if difat.len() < num_fat_sectors {
            return Err(LoError::Parse(format!(
                "CFB FAT list too short: expected {num_fat_sectors}, found {}",
                difat.len()
            )));
        }

        // Concatenate the listed FAT sectors into one flat table of u32 links.
        let mut fat = Vec::new();
        for &fat_sector in difat.iter().take(num_fat_sectors) {
            let sector = read_sector(bytes, sector_size, fat_sector)?;
            fat.extend(
                sector
                    .chunks_exact(4)
                    .map(|raw| u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]])),
            );
        }

        let directory_bytes = read_chain(bytes, sector_size, &fat, first_dir_sector, None)?;
        let mut directory = Vec::new();
        for entry in directory_bytes.chunks_exact(128) {
            let name_len = usize::from(read_u16(entry, 0x40)?);
            let object_type = entry[0x42];
            if object_type == 0 {
                // Unused directory slot.
                continue;
            }
            // The stored length is in bytes and includes the UTF-16 NUL
            // terminator; anything outside 2..=64 is treated as "no name".
            let raw_name = if (2..=64).contains(&name_len) {
                &entry[..name_len - 2]
            } else {
                &entry[..0]
            };
            let utf16: Vec<u16> = raw_name
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                .collect();
            let name = String::from_utf16(&utf16)
                .unwrap_or_else(|_| String::from_utf8_lossy(raw_name).to_string())
                .trim_end_matches('\0')
                .to_string();
            let starting_sector = read_u32(entry, 0x74)?;
            // For major version 3 only the low 32 bits of the size field are read.
            let stream_size = if major_version == 3 {
                read_u32(entry, 0x78)? as usize
            } else {
                read_u64(entry, 0x78)? as usize
            };
            directory.push(DirectoryEntry {
                name,
                object_type,
                starting_sector,
                stream_size,
            });
        }

        // The root entry (object type 5) owns the mini stream.
        let (root_start, root_size) = directory
            .iter()
            .find(|entry| entry.object_type == 5)
            .map(|entry| (entry.starting_sector, entry.stream_size))
            .ok_or_else(|| LoError::InvalidInput("CFB Root Entry not found".to_string()))?;

        let mini_fat_bytes = if num_mini_fat_sectors > 0 {
            read_chain(bytes, sector_size, &fat, first_mini_fat_sector, None)?
        } else {
            Vec::new()
        };
        let mini_fat: Vec<u32> = mini_fat_bytes
            .chunks_exact(4)
            .map(|raw| u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]))
            .collect();

        let mini_stream = if root_size > 0 && root_start != END_OF_CHAIN {
            read_chain(bytes, sector_size, &fat, root_start, Some(root_size))?
        } else {
            Vec::new()
        };

        Ok(Self {
            bytes: bytes.to_vec(),
            sector_size,
            mini_sector_size,
            mini_stream_cutoff,
            fat,
            mini_fat,
            directory,
            mini_stream,
        })
    }

    /// Returns the contents of the first stream entry (object type 2) whose
    /// name matches `name`, compared ASCII case-insensitively.
    ///
    /// Streams smaller than the mini stream cutoff are read from the mini
    /// stream; all others are read from regular sectors.
    ///
    /// # Errors
    /// Returns `LoError::InvalidInput` when no such stream exists, and
    /// `LoError::Parse` when its sector chain is malformed.
    fn read_stream(&self, name: &str) -> Result<Vec<u8>> {
        let entry = self
            .directory
            .iter()
            .find(|entry| entry.object_type == 2 && entry.name.eq_ignore_ascii_case(name))
            .ok_or_else(|| LoError::InvalidInput(format!("CFB stream not found: {name}")))?;
        if entry.stream_size < self.mini_stream_cutoff {
            self.read_mini_stream(entry.starting_sector, entry.stream_size)
        } else {
            read_chain(
                &self.bytes,
                self.sector_size,
                &self.fat,
                entry.starting_sector,
                Some(entry.stream_size),
            )
        }
    }

    /// Reads up to `size` bytes by following the mini FAT chain that starts at
    /// `start_sector` through the mini stream.
    ///
    /// A chain that ends before `size` bytes were collected yields the shorter
    /// data without an error.
    ///
    /// # Errors
    /// Returns `LoError::Parse` when the mini stream is empty, the chain loops,
    /// a mini sector lies outside the mini stream, or a link points outside
    /// the mini FAT.
    fn read_mini_stream(&self, start_sector: u32, size: usize) -> Result<Vec<u8>> {
        if size == 0 {
            return Ok(Vec::new());
        }
        if self.mini_stream.is_empty() {
            return Err(LoError::Parse("CFB mini stream is empty".to_string()));
        }
        let out_of_bounds = || LoError::Parse("CFB mini sector out of bounds".to_string());
        let mut sector = start_sector;
        let mut out = Vec::new();
        let mut seen = HashSet::new();
        while sector != END_OF_CHAIN && sector != FREE_SECTOR {
            if !seen.insert(sector) {
                return Err(LoError::Parse("CFB mini FAT cycle detected".to_string()));
            }
            // Both the sector id and the mini sector size come from the file,
            // so the offset arithmetic must not be allowed to overflow.
            let offset = (sector as usize)
                .checked_mul(self.mini_sector_size)
                .ok_or_else(out_of_bounds)?;
            let end = offset
                .checked_add(self.mini_sector_size)
                .ok_or_else(out_of_bounds)?;
            let chunk = self.mini_stream.get(offset..end).ok_or_else(out_of_bounds)?;
            out.extend_from_slice(chunk);
            if out.len() >= size {
                out.truncate(size);
                break;
            }
            sector = *self
                .mini_fat
                .get(sector as usize)
                .ok_or_else(|| LoError::Parse("CFB mini FAT reference out of range".to_string()))?;
        }
        Ok(out)
    }
}
/// Finds the `(offset, length)` of the CLX inside the table stream.
///
/// The pair stored at 0x1A2 / 0x1A6 of the WordDocument stream is tried first.
/// If it does not pass `looks_like_clx`, every even offset from 0x80 up to the
/// first 0x400 bytes of the stream is probed as a candidate pair.
///
/// # Errors
/// Returns `LoError::Parse` when no candidate pair describes a plausible CLX.
fn locate_clx(word_document: &[u8], table: &[u8]) -> Result<(usize, usize)> {
    // Reads an (fc, lcb) pair at `offset` and keeps it only if it looks like a CLX.
    let probe = |offset: usize| -> Result<Option<(usize, usize)>> {
        let fc = read_u32(word_document, offset)? as usize;
        let lcb = read_u32(word_document, offset + 4)? as usize;
        Ok(looks_like_clx(table, fc, lcb).then_some((fc, lcb)))
    };

    if word_document.len() >= 0x1AA {
        if let Some(found) = probe(0x1A2)? {
            return Ok(found);
        }
    }

    let scan_end = word_document.len().min(0x400).saturating_sub(8);
    let mut offset = 0x80usize;
    while offset < scan_end {
        if let Some(found) = probe(offset)? {
            return Ok(found);
        }
        offset += 2;
    }

    Err(LoError::Parse(
        "failed to locate the DOC piece table (CLX)".to_string(),
    ))
}
/// Cheap plausibility check that `table[fc..fc + lcb]` is a CLX.
///
/// The region must be at least five bytes long, lie inside `table`, and consist
/// of zero or more 0x01-tagged records (tag, 16-bit little-endian size,
/// payload) followed by a 0x02 tag with at least four more bytes after it.
/// Any other tag, or running out of data before a 0x02 tag, yields `false`.
fn looks_like_clx(table: &[u8], fc: usize, lcb: usize) -> bool {
    if lcb < 5 {
        return false;
    }
    let clx = match fc.checked_add(lcb) {
        Some(end) if end <= table.len() => &table[fc..end],
        _ => return false,
    };

    let mut cursor = 0usize;
    while let Some(&tag) = clx.get(cursor) {
        match tag {
            // Piece-table marker: needs the tag plus a 4-byte length field.
            0x02 => return cursor + 5 <= clx.len(),
            // Property record: skip its 3-byte header and payload.
            0x01 => {
                let Some(size_field) = clx.get(cursor + 1..cursor + 3) else {
                    return false;
                };
                let payload = usize::from(u16::from_le_bytes([size_field[0], size_field[1]]));
                cursor += 3 + payload;
            }
            _ => return false,
        }
    }
    false
}
/// Walks a CLX, skipping 0x01-tagged records, and parses the piece table that
/// follows the first 0x02 tag.
///
/// # Errors
/// Returns `LoError::Parse` for a truncated 0x01 record header, an unknown tag,
/// a piece table that overruns the CLX, or a CLX with no 0x02 tag at all.
fn parse_clx(clx: &[u8]) -> Result<Vec<Piece>> {
    let mut cursor = 0usize;
    while let Some(&tag) = clx.get(cursor) {
        match tag {
            0x01 => {
                let size_field = clx.get(cursor + 1..cursor + 3).ok_or_else(|| {
                    LoError::Parse("DOC CLX has a truncated RgPrc".to_string())
                })?;
                let payload = usize::from(u16::from_le_bytes([size_field[0], size_field[1]]));
                cursor += 3 + payload;
            }
            0x02 => {
                // Tag byte, then a 32-bit byte count, then the PlcPcd itself.
                let lcb = read_u32(clx, cursor + 1)? as usize;
                let start = cursor + 5;
                let end = start
                    .checked_add(lcb)
                    .ok_or_else(|| LoError::Parse("DOC PlcPcd overflow".to_string()))?;
                let plc = clx.get(start..end).ok_or_else(|| {
                    LoError::Parse("DOC PlcPcd exceeds CLX bounds".to_string())
                })?;
                return parse_plcpcd(plc);
            }
            other => {
                return Err(LoError::Parse(format!(
                    "unexpected DOC CLX tag 0x{other:02X}"
                )));
            }
        }
    }
    Err(LoError::Parse(
        "DOC CLX did not contain a piece table".to_string(),
    ))
}
/// Decodes a PlcPcd: `n + 1` 32-bit character positions followed by `n`
/// eight-byte piece descriptors, i.e. `12 * n + 4` bytes in total.
///
/// From each descriptor only the 32-bit field at byte offset 2 is used: bit 30
/// becomes `Piece::compressed` and the low 30 bits become `Piece::fc`.
///
/// # Errors
/// Returns `LoError::Parse` when the length is not of the form `12 * n + 4`.
fn parse_plcpcd(bytes: &[u8]) -> Result<Vec<Piece>> {
    let piece_count = match bytes.len().checked_sub(4) {
        Some(payload) if payload % 12 == 0 => payload / 12,
        _ => {
            return Err(LoError::Parse("DOC PlcPcd has an invalid size".to_string()));
        }
    };
    // Descriptors start right after the (piece_count + 1) character positions.
    let descriptors_at = (piece_count + 1) * 4;

    (0..piece_count)
        .map(|slot| -> Result<Piece> {
            let cp_start = read_u32(bytes, slot * 4)? as usize;
            let cp_end = read_u32(bytes, (slot + 1) * 4)? as usize;
            let raw_fc = read_u32(bytes, descriptors_at + slot * 8 + 2)?;
            Ok(Piece {
                cp_start,
                cp_end,
                fc: raw_fc & 0x3FFF_FFFF,
                compressed: (raw_fc & 0x4000_0000) != 0,
            })
        })
        .collect()
}
/// Concatenates the decoded text of every piece, in piece-table order.
///
/// Pieces whose end position is not after their start position are skipped.
/// Compressed pieces occupy one byte per character starting at `fc / 2`;
/// other pieces occupy two bytes per character (UTF-16LE) starting at `fc`.
///
/// # Errors
/// Returns `LoError::Parse` when a piece's byte range overflows or reaches
/// past the end of `word_document`.
fn extract_piece_text(word_document: &[u8], pieces: &[Piece]) -> Result<String> {
    let mut text = String::new();
    for piece in pieces {
        let char_count = match piece.cp_end.checked_sub(piece.cp_start) {
            Some(count) if count > 0 => count,
            _ => continue,
        };

        if piece.compressed {
            let start = (piece.fc / 2) as usize;
            let end = start
                .checked_add(char_count)
                .ok_or_else(|| LoError::Parse("DOC compressed piece overflow".to_string()))?;
            let run = word_document.get(start..end).ok_or_else(|| {
                LoError::Parse("DOC compressed piece exceeds WordDocument bounds".to_string())
            })?;
            text.push_str(&decode_compressed_text(run));
            continue;
        }

        let start = piece.fc as usize;
        let end = char_count
            .checked_mul(2)
            .and_then(|byte_len| start.checked_add(byte_len))
            .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
        let run = word_document.get(start..end).ok_or_else(|| {
            LoError::Parse("DOC unicode piece exceeds WordDocument bounds".to_string())
        })?;
        text.push_str(&decode_utf16le(run));
    }
    Ok(text)
}
/// Decodes little-endian UTF-16 bytes into a `String`.
///
/// A trailing odd byte is ignored and unpaired surrogates are replaced with
/// U+FFFD.
fn decode_utf16le(bytes: &[u8]) -> String {
    let code_units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
    char::decode_utf16(code_units)
        .map(|decoded| decoded.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect()
}
/// Decodes one-byte-per-character piece text.
///
/// Bytes outside 0x80..=0x9F map to the Unicode code point with the same
/// value. Bytes inside that range are looked up in a table of typographic
/// characters; the slots 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no replacement
/// and keep their own value.
fn decode_compressed_text(bytes: &[u8]) -> String {
    // Indexed by `byte - 0x80`.
    const HIGH_RANGE: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}',
        '\u{2021}', '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{008D}',
        '\u{017D}', '\u{008F}', '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}',
        '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];

    bytes
        .iter()
        .map(|&byte| match byte {
            0x80..=0x9F => HIGH_RANGE[usize::from(byte - 0x80)],
            _ => char::from(byte),
        })
        .collect()
}
/// Turns raw piece text into readable paragraphs.
///
/// First pass, per character:
/// - `\r` becomes a blank-line break, `\n` / U+000B / U+000C a line break;
/// - U+0007 becomes a tab (at most one in a row);
/// - every other control character except tab is dropped;
/// - remaining whitespace (including tab) collapses to a single space;
/// - everything else is copied.
///
/// Second pass: each line is trimmed, empty lines are discarded, and the
/// surviving lines are joined with one blank line between them.
fn normalize_doc_text(raw: &str) -> String {
    let mut flattened = String::new();
    let mut prev_was_space = false;
    for ch in raw.chars() {
        if ch == '\r' {
            if !flattened.ends_with("\n\n") {
                flattened.push_str("\n\n");
            }
            prev_was_space = false;
        } else if matches!(ch, '\n' | '\u{000B}' | '\u{000C}') {
            if !flattened.ends_with('\n') {
                flattened.push('\n');
            }
            prev_was_space = false;
        } else if ch == '\u{0007}' {
            if !flattened.ends_with('\t') {
                flattened.push('\t');
            }
            prev_was_space = false;
        } else if ch != '\t' && ch.is_control() {
            // Dropped without touching the space-collapsing state; this covers
            // U+0000 and U+0013..=U+0015 along with all other control characters.
        } else if ch.is_whitespace() {
            if !prev_was_space {
                flattened.push(' ');
                prev_was_space = true;
            }
        } else {
            flattened.push(ch);
            prev_was_space = false;
        }
    }

    // Trimmed lines never start or end with whitespace, so joining the
    // non-empty ones with "\n\n" needs no further trimming.
    flattened
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n\n")
}
fn read_chain(
bytes: &[u8],
sector_size: usize,
fat: &[u32],
start_sector: u32,
size: Option<usize>,
) -> Result<Vec<u8>> {
if start_sector == END_OF_CHAIN || start_sector == FREE_SECTOR {
return Ok(Vec::new());
}
let mut out = Vec::new();
let mut seen = HashSet::new();
let mut sector = start_sector;
while sector != END_OF_CHAIN && sector != FREE_SECTOR {
if sector == FAT_SECTOR || sector == DIFAT_SECTOR {
return Err(LoError::Parse(
"CFB stream chain points to a reserved sector".to_string(),
));
}
if !seen.insert(sector) {
return Err(LoError::Parse("CFB FAT cycle detected".to_string()));
}
let data = read_sector(bytes, sector_size, sector)?;
out.extend_from_slice(data);
if let Some(limit) = size {
if out.len() >= limit {
out.truncate(limit);
break;
}
}
sector = *fat
.get(sector as usize)
.ok_or_else(|| LoError::Parse("CFB FAT reference out of range".to_string()))?;
}
Ok(out)
}
/// Borrows the bytes of sector `sector_id`.
///
/// Sector 0 starts one `sector_size` into the file, so the sector occupies
/// `(sector_id + 1) * sector_size .. (sector_id + 2) * sector_size`.
///
/// # Errors
/// Returns `LoError::Parse` when the range overflows or extends past `bytes`.
fn read_sector(bytes: &[u8], sector_size: usize, sector_id: u32) -> Result<&[u8]> {
    let first_byte = match (sector_id as usize + 1).checked_mul(sector_size) {
        Some(offset) => offset,
        None => return Err(LoError::Parse("CFB sector offset overflow".to_string())),
    };
    let past_last = match first_byte.checked_add(sector_size) {
        Some(end) => end,
        None => return Err(LoError::Parse("CFB sector end overflow".to_string())),
    };
    match bytes.get(first_byte..past_last) {
        Some(sector) => Ok(sector),
        None => Err(LoError::Parse(
            "CFB sector extends past the file end".to_string(),
        )),
    }
}
/// Reads a little-endian `u16` from `bytes` at `offset`.
///
/// # Errors
/// Returns `LoError::Parse` when fewer than two bytes are available at
/// `offset`. This includes offsets so large that `offset + 2` would not fit in
/// a `usize`; slicing from `offset` first avoids that addition entirely.
fn read_u16(bytes: &[u8], offset: usize) -> Result<u16> {
    let slice = bytes
        .get(offset..)
        .and_then(|tail| tail.get(..2))
        .ok_or_else(|| LoError::Parse("unexpected EOF while reading u16".to_string()))?;
    Ok(u16::from_le_bytes([slice[0], slice[1]]))
}
/// Reads a little-endian `u32` from `bytes` at `offset`.
///
/// # Errors
/// Returns `LoError::Parse` when fewer than four bytes are available at
/// `offset`. This includes offsets so large that `offset + 4` would not fit in
/// a `usize`; slicing from `offset` first avoids that addition entirely.
fn read_u32(bytes: &[u8], offset: usize) -> Result<u32> {
    let slice = bytes
        .get(offset..)
        .and_then(|tail| tail.get(..4))
        .ok_or_else(|| LoError::Parse("unexpected EOF while reading u32".to_string()))?;
    Ok(u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]))
}
/// Reads a little-endian `u64` from `bytes` at `offset`.
///
/// # Errors
/// Returns `LoError::Parse` when fewer than eight bytes are available at
/// `offset`. This includes offsets so large that `offset + 8` would not fit in
/// a `usize`; slicing from `offset` first avoids that addition entirely.
fn read_u64(bytes: &[u8], offset: usize) -> Result<u64> {
    let slice = bytes
        .get(offset..)
        .and_then(|tail| tail.get(..8))
        .ok_or_else(|| LoError::Parse("unexpected EOF while reading u64".to_string()))?;
    Ok(u64::from_le_bytes([
        slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
    ]))
}