use lo_core::{CfbFile, LoError, Result};
/// Extracts the plain text of a legacy binary Word (`.doc`) file.
///
/// `bytes` is the complete compound-file container. The function reads the
/// `WordDocument` stream, locates the CLX inside the table stream, walks the
/// piece table (PlcPcd) it contains, and decodes every piece either as 8-bit
/// text (via [`decode_cp1252`]) or as UTF-16LE text. Field-marker and NUL
/// characters are dropped by [`push_doc_char`]; paragraph, cell, line-break and
/// page-break control characters are turned into newlines/tabs, and runs of
/// blank lines are collapsed by [`collapse_blank_lines`].
///
/// # Errors
///
/// * Any error reported by `CfbFile::open` / `read_stream`.
/// * `LoError::Unsupported` when bit 8 of the flag word at offset `0x0A` is set
///   (the file is encrypted).
/// * `LoError::Parse` when the `WordDocument` stream is shorter than `0x20`
///   bytes, or when the CLX, the PlcPcd or any text piece lies outside the
///   stream it is supposed to live in.
pub fn extract_text_from_doc(bytes: &[u8]) -> Result<String> {
    let cfb = CfbFile::open(bytes)?;
    let word = cfb.read_stream("WordDocument")?;
    if word.len() < 0x20 {
        return Err(LoError::Parse(
            "DOC WordDocument stream is too small".to_string(),
        ));
    }

    // Flag word at 0x0A: bit 8 = encrypted, bit 9 = which table stream is live.
    let fib_flags = read_u16(&word, 0x0A)?;
    if (fib_flags & (1 << 8)) != 0 {
        return Err(LoError::Unsupported(
            "encrypted legacy DOC files are not supported by the pure-Rust parser".to_string(),
        ));
    }
    let use_1table = (fib_flags & (1 << 9)) != 0;
    let table_stream_name = if use_1table { "1Table" } else { "0Table" };
    // Best effort: if the advertised table stream is missing, try the other one.
    let table = cfb
        .read_stream(table_stream_name)
        .or_else(|_| cfb.read_stream(if use_1table { "0Table" } else { "1Table" }))?;

    // Offset and length of the CLX inside the table stream.
    let fc_clx = read_u32(&word, 0x01A2)? as usize;
    let lcb_clx = read_u32(&word, 0x01A6)? as usize;
    let clx_end = fc_clx
        .checked_add(lcb_clx)
        .ok_or_else(|| LoError::Parse("DOC CLX overflow".to_string()))?;
    if clx_end > table.len() {
        return Err(LoError::Parse(
            "DOC CLX extends past the table stream".to_string(),
        ));
    }
    let clx = &table[fc_clx..clx_end];

    // The PlcPcd is laid out as (piece_count + 1) u32 character positions
    // followed by piece_count 8-byte piece descriptors.
    let plcpcd = extract_plcpcd(clx)?;
    let piece_count = plcpcd_piece_count(plcpcd.len())?;
    let cp_count = piece_count + 1;
    let mut cps = Vec::with_capacity(cp_count);
    for index in 0..cp_count {
        cps.push(read_u32(plcpcd, index * 4)? as usize);
    }
    let pcd_base = cp_count * 4;

    let mut out = String::new();
    for (piece_index, bounds) in cps.windows(2).enumerate() {
        // Non-monotonic character positions yield an empty piece rather than an error.
        let char_count = bounds[1].saturating_sub(bounds[0]);
        let pcd = plcpcd
            .get(pcd_base + piece_index * 8..pcd_base + (piece_index + 1) * 8)
            .ok_or_else(|| LoError::Parse("DOC PCD out of bounds".to_string()))?;
        // Bit 30 selects the 8-bit encoding; the low 30 bits are the file offset.
        let fc_compressed = read_u32(pcd, 2)?;
        let is_compressed = (fc_compressed & 0x4000_0000) != 0;
        let fc = (fc_compressed & 0x3FFF_FFFF) as usize;

        if is_compressed {
            // 8-bit pieces store one byte per character at half the recorded offset.
            let start = fc / 2;
            let end = start
                .checked_add(char_count)
                .ok_or_else(|| LoError::Parse("DOC ANSI piece overflow".to_string()))?;
            let data = word
                .get(start..end)
                .ok_or_else(|| LoError::Parse("DOC ANSI piece out of bounds".to_string()))?;
            for &byte in data {
                push_doc_char(&mut out, decode_cp1252(byte));
            }
        } else {
            let start = fc;
            let byte_len = char_count
                .checked_mul(2)
                .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
            let end = start
                .checked_add(byte_len)
                .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
            let data = word
                .get(start..end)
                .ok_or_else(|| LoError::Parse("DOC unicode piece out of bounds".to_string()))?;
            let units = data
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
            // Decode as UTF-16 so surrogate pairs become a single character
            // instead of being discarded; unpaired surrogates are still skipped.
            for ch in char::decode_utf16(units).flatten() {
                push_doc_char(&mut out, ch);
            }
        }
    }

    // Map Word's control characters onto plain-text equivalents:
    // CR -> newline, 0x07 -> tab, 0x0B -> newline, 0x0C -> blank line.
    let normalized = out
        .replace('\r', "\n")
        .replace('\u{0007}', "\t")
        .replace('\u{000B}', "\n")
        .replace('\u{000C}', "\n\n");
    Ok(collapse_blank_lines(&normalized))
}
/// Locates the PlcPcd (piece table) inside a CLX blob and returns its bytes.
///
/// The CLX is scanned as a sequence of tagged records:
///
/// * tag `0x01` — followed by a `u16` length and that many payload bytes,
///   which are skipped;
/// * tag `0x02` — followed by a `u32` length and the PlcPcd payload, which is
///   returned as a sub-slice of `clx`.
///
/// # Errors
///
/// Returns `LoError::Parse` when a length field cannot be read, when the
/// PlcPcd payload extends past the end of `clx`, when an unknown tag byte is
/// encountered, or when the CLX ends without a `0x02` record.
fn extract_plcpcd(clx: &[u8]) -> Result<&[u8]> {
    let mut offset = 0usize;
    while let Some(&tag) = clx.get(offset) {
        match tag {
            0x01 => {
                let cb = usize::from(read_u16(clx, offset + 1)?);
                // Tag byte + u16 length + payload.
                offset += 3 + cb;
            }
            0x02 => {
                let lcb = read_u32(clx, offset + 1)? as usize;
                let start = offset + 5;
                // `lcb` comes straight from the file and may be up to u32::MAX,
                // so the end offset can overflow `usize` on 32-bit targets.
                return start
                    .checked_add(lcb)
                    .and_then(|end| clx.get(start..end))
                    .ok_or_else(|| LoError::Parse("DOC PlcPcd out of bounds".to_string()));
            }
            other => {
                return Err(LoError::Parse(format!(
                    "unexpected DOC CLX byte {other:#x}"
                )))
            }
        }
    }
    Err(LoError::Parse(
        "DOC CLX does not contain a PlcPcd".to_string(),
    ))
}
/// Computes how many pieces a PlcPcd of `length` bytes describes.
///
/// A PlcPcd with `n` pieces occupies `4 + 12 * n` bytes, so the length must be
/// at least 4 and the remainder after those 4 bytes must be a multiple of 12.
///
/// # Errors
///
/// Returns `LoError::Parse` when `length` does not have that shape.
fn plcpcd_piece_count(length: usize) -> Result<usize> {
    match length.checked_sub(4) {
        Some(payload) if payload % 12 == 0 => Ok(payload / 12),
        _ => Err(LoError::Parse("DOC PlcPcd size is invalid".to_string())),
    }
}
/// Appends `ch` to `out` unless it is one of the characters this parser
/// discards: NUL (`U+0000`) or the control characters `U+0013`, `U+0014` and
/// `U+0015`.
fn push_doc_char(out: &mut String, ch: char) {
    let discarded = matches!(ch, '\u{0000}' | '\u{0013}'..='\u{0015}');
    if !discarded {
        out.push(ch);
    }
}
/// Normalises line structure in extracted text.
///
/// * Trailing whitespace is removed from every line.
/// * Each run of consecutive blank (empty or whitespace-only) lines is reduced
///   to a single empty line.
/// * Leading and trailing newlines are removed from the result.
fn collapse_blank_lines(text: &str) -> String {
    let mut collapsed = String::new();
    let mut previous_was_blank = false;
    for raw in text.lines() {
        // A line is blank exactly when nothing survives trimming its tail.
        let content = raw.trim_end();
        if content.is_empty() {
            if !previous_was_blank {
                collapsed.push('\n');
            }
            previous_was_blank = true;
        } else {
            previous_was_blank = false;
            collapsed.push_str(content);
            collapsed.push('\n');
        }
    }
    collapsed.trim_matches('\n').to_owned()
}
/// Reads a little-endian `u16` from `bytes` starting at `offset`.
///
/// # Errors
///
/// Returns `LoError::Parse` when the two bytes at `offset..offset + 2` are not
/// fully inside `bytes`, including when `offset + 2` would overflow `usize`.
fn read_u16(bytes: &[u8], offset: usize) -> Result<u16> {
    let slice = offset
        .checked_add(2)
        .and_then(|end| bytes.get(offset..end))
        .ok_or_else(|| LoError::Parse(format!("DOC read_u16 out of bounds at {offset}")))?;
    Ok(u16::from_le_bytes([slice[0], slice[1]]))
}
/// Reads a little-endian `u32` from `bytes` starting at `offset`.
///
/// # Errors
///
/// Returns `LoError::Parse` when the four bytes at `offset..offset + 4` are
/// not fully inside `bytes`, including when `offset + 4` would overflow
/// `usize`.
fn read_u32(bytes: &[u8], offset: usize) -> Result<u32> {
    let slice = offset
        .checked_add(4)
        .and_then(|end| bytes.get(offset..end))
        .ok_or_else(|| LoError::Parse(format!("DOC read_u32 out of bounds at {offset}")))?;
    Ok(u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]]))
}
/// Decodes a single Windows-1252 byte into a Unicode character.
///
/// Bytes outside `0x80..=0x9F` map to the code point with the same numeric
/// value. Inside that range the byte is looked up in a table; the five bytes
/// with no dedicated entry (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) also map to
/// the code point with the same numeric value.
fn decode_cp1252(byte: u8) -> char {
    // Replacement characters for bytes 0x80..=0x9F, indexed by `byte - 0x80`.
    const HIGH_RANGE: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];
    match byte {
        0x80..=0x9F => HIGH_RANGE[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}