use crate::error::{KreuzbergError, Result};
use std::io::Cursor;
/// Output of [`extract_doc_text`]: the recovered plain text plus whatever
/// property-set metadata was found in the OLE container.
pub struct DocExtractionResult {
/// Extracted and normalized document text.
pub text: String,
/// Best-effort metadata from the OLE summary-information streams.
pub metadata: DocMetadata,
}
/// Metadata recovered from the OLE `\x05SummaryInformation` /
/// `\x05DocumentSummaryInformation` property-set streams.
///
/// Every field is optional because each property may be absent from a given
/// file. Derives `Debug`/`Clone`/`PartialEq` so callers can log, copy, and
/// compare metadata without hand-written impls.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocMetadata {
    /// Document title (property id 2).
    pub title: Option<String>,
    /// Document subject (property id 3).
    pub subject: Option<String>,
    /// Original author (property id 4).
    pub author: Option<String>,
    /// Last author to save the file (property id 8).
    pub last_author: Option<String>,
    /// Creation timestamp. NOTE(review): the property-set parser in this
    /// module never populates this field (FILETIME values are not decoded).
    pub created: Option<String>,
    /// Last-modified timestamp. NOTE(review): likewise never populated here.
    pub modified: Option<String>,
    /// Revision number string (property id 9).
    pub revision_number: Option<String>,
}
/// Extracts text and metadata from a legacy binary `.doc` file.
///
/// The file is an OLE/CFB container: the FIB and text live in the
/// `WordDocument` stream, and (for Word 97+ layouts) the piece table lives in
/// the `0Table` or `1Table` stream, selected by a FIB flag bit.
///
/// # Errors
/// Returns a parsing error when the container cannot be opened, the
/// `WordDocument` stream is missing or too short, or the magic number is wrong.
pub fn extract_doc_text(content: &[u8]) -> Result<DocExtractionResult> {
    let cursor = Cursor::new(content);
    let mut comp = cfb::CompoundFile::open(cursor)
        .map_err(|e| KreuzbergError::parsing(format!("Failed to open DOC as OLE container: {e}")))?;
    // Metadata is best-effort: missing property streams are silently ignored.
    let metadata = extract_doc_metadata(&mut comp);
    let word_doc = read_stream(&mut comp, "/WordDocument")?;
    // Need at least the magic, nFib, and the flag word at 0x0A..0x0C.
    if word_doc.len() < 12 {
        return Err(KreuzbergError::parsing("WordDocument stream too short"));
    }
    let w_ident = u16::from_le_bytes([word_doc[0], word_doc[1]]);
    if w_ident != 0xA5EC {
        return Err(KreuzbergError::parsing(format!(
            "Invalid DOC magic number: 0x{w_ident:04X}, expected 0xA5EC"
        )));
    }
    let n_fib = u16::from_le_bytes([word_doc[2], word_doc[3]]);
    if n_fib >= 101 {
        // Bit 9 of the FIB flag word (fWhichTblStm) selects the table stream.
        let flags_a = u16::from_le_bytes([word_doc[0x0A], word_doc[0x0B]]);
        let table_stream_name = if (flags_a & 0x0200) != 0 { "/1Table" } else { "/0Table" };
        // The table stream is only needed for the piece-table path. Files in
        // this nFib range sometimes ship without any table stream; previously
        // the unconditional `read_stream(...)?` turned those into hard errors
        // even though the text is recoverable from the WordDocument stream.
        match read_stream(&mut comp, table_stream_name) {
            Ok(table_stream) => {
                extract_text_word97(&word_doc, &table_stream).map(|text| DocExtractionResult { text, metadata })
            }
            Err(_) => extract_text_word6(&word_doc).map(|text| DocExtractionResult { text, metadata }),
        }
    } else {
        // Pre-Word 97 layouts keep the text inside the WordDocument stream.
        extract_text_word6(&word_doc).map(|text| DocExtractionResult { text, metadata })
    }
}
fn extract_text_word97(word_doc: &[u8], table_stream: &[u8]) -> Result<String> {
let fib_base_size = 32; let csw_offset = fib_base_size;
if word_doc.len() < csw_offset + 2 {
return Err(KreuzbergError::parsing("FIB too short for csw"));
}
let csw = u16::from_le_bytes([word_doc[csw_offset], word_doc[csw_offset + 1]]) as usize;
let rg_w_offset = csw_offset + 2;
let cslw_offset = rg_w_offset + csw * 2;
if word_doc.len() < cslw_offset + 2 {
return Err(KreuzbergError::parsing("FIB too short for cslw"));
}
let cslw = u16::from_le_bytes([word_doc[cslw_offset], word_doc[cslw_offset + 1]]) as usize;
let rg_lw_offset = cslw_offset + 2;
let ccp_text_offset = rg_lw_offset + 3 * 4;
if word_doc.len() < ccp_text_offset + 4 {
return Err(KreuzbergError::parsing("FIB too short for ccpText"));
}
let ccp_text = u32::from_le_bytes([
word_doc[ccp_text_offset],
word_doc[ccp_text_offset + 1],
word_doc[ccp_text_offset + 2],
word_doc[ccp_text_offset + 3],
]) as usize;
let mut total_cp = ccp_text;
for i in 4..=9 {
let off = rg_lw_offset + i * 4;
if word_doc.len() >= off + 4 {
total_cp +=
u32::from_le_bytes([word_doc[off], word_doc[off + 1], word_doc[off + 2], word_doc[off + 3]]) as usize;
}
}
if total_cp > 0 {
total_cp += 1;
}
let cbrgfclcb_offset = rg_lw_offset + cslw * 4;
if word_doc.len() < cbrgfclcb_offset + 2 {
return Err(KreuzbergError::parsing("FIB too short for cbRgFcLcb"));
}
let _ = u16::from_le_bytes([word_doc[cbrgfclcb_offset], word_doc[cbrgfclcb_offset + 1]]) as usize;
let rg_fc_lcb_offset = cbrgfclcb_offset + 2;
let fc_clx_offset = rg_fc_lcb_offset + 66 * 8;
let lcb_clx_offset = fc_clx_offset + 4;
if word_doc.len() < lcb_clx_offset + 4 {
return Err(KreuzbergError::parsing("FIB too short for fcClx/lcbClx"));
}
let fc_clx = u32::from_le_bytes([
word_doc[fc_clx_offset],
word_doc[fc_clx_offset + 1],
word_doc[fc_clx_offset + 2],
word_doc[fc_clx_offset + 3],
]) as usize;
let lcb_clx = u32::from_le_bytes([
word_doc[lcb_clx_offset],
word_doc[lcb_clx_offset + 1],
word_doc[lcb_clx_offset + 2],
word_doc[lcb_clx_offset + 3],
]) as usize;
if fc_clx == 0 || lcb_clx == 0 {
return extract_text_contiguous(word_doc, ccp_text);
}
if table_stream.len() < fc_clx + lcb_clx {
return Err(KreuzbergError::parsing("CLX extends beyond table stream"));
}
let clx = &table_stream[fc_clx..fc_clx + lcb_clx];
let mut pos = 0;
while pos < clx.len() {
let clxt = clx[pos];
if clxt == 0x02 {
pos += 1;
if pos + 4 > clx.len() {
return Err(KreuzbergError::parsing("Pcdt truncated at lcb"));
}
let _ = u32::from_le_bytes([clx[pos], clx[pos + 1], clx[pos + 2], clx[pos + 3]]) as usize;
pos += 4;
let plc_pcd = &clx[pos..];
return extract_text_from_piece_table(word_doc, plc_pcd, ccp_text, total_cp);
} else if clxt == 0x01 {
pos += 1;
if pos + 2 > clx.len() {
break;
}
let cb_grpprl = u16::from_le_bytes([clx[pos], clx[pos + 1]]) as usize;
pos += 2 + cb_grpprl;
} else {
break;
}
}
extract_text_fallback(word_doc, ccp_text)
}
/// Decodes the main-document text via the PlcPcd (piece table).
///
/// `plc_pcd` holds n+1 character positions (CPs, 4 bytes each) followed by n
/// 8-byte PCD descriptors. Each PCD records where that piece's characters sit
/// in the WordDocument stream and whether they are stored as single CP-1252
/// bytes ("compressed") or as UTF-16LE code units.
fn extract_text_from_piece_table(word_doc: &[u8], plc_pcd: &[u8], ccp_text: usize, total_cp: usize) -> Result<String> {
let plc_size = plc_pcd.len();
// Smallest plausible table: 2 CPs (8 bytes) + 1 PCD (8 bytes) = 16 bytes.
if plc_size < 16 {
return Err(KreuzbergError::parsing("PlcPcd too small"));
}
// n pieces => 4*(n+1) CP bytes + 8*n PCD bytes = 12n + 4 total.
let n = (plc_size - 4) / 12;
if n == 0 {
return Ok(String::new());
}
let mut result = String::with_capacity(ccp_text);
for i in 0..n {
let cp_start_off = i * 4;
let cp_end_off = (i + 1) * 4;
// PCDs start right after the (n+1)-entry CP array.
let pcd_off = (n + 1) * 4 + i * 8;
if cp_end_off + 4 > plc_size || pcd_off + 8 > plc_size {
break;
}
let cp_start = u32::from_le_bytes([
plc_pcd[cp_start_off],
plc_pcd[cp_start_off + 1],
plc_pcd[cp_start_off + 2],
plc_pcd[cp_start_off + 3],
]) as usize;
let cp_end = u32::from_le_bytes([
plc_pcd[cp_end_off],
plc_pcd[cp_end_off + 1],
plc_pcd[cp_end_off + 2],
plc_pcd[cp_end_off + 3],
]) as usize;
// Pieces past the document's total CP range are bogus; stop.
if cp_start >= total_cp {
break;
}
// The fc dword is bytes 2..6 of the PCD (after 2 bytes of flags).
let fc_raw = u32::from_le_bytes([
plc_pcd[pcd_off + 2],
plc_pcd[pcd_off + 3],
plc_pcd[pcd_off + 4],
plc_pcd[pcd_off + 5],
]);
// Bit 30 (fCompressed) set => 8-bit CP-1252 storage; clear => UTF-16LE.
let is_compressed = (fc_raw & 0x4000_0000) != 0;
let char_count = cp_end.saturating_sub(cp_start);
// Clamp to the main-document text: pieces straddling ccpText are
// truncated, pieces entirely past it (subdocument text) are skipped.
let chars_to_read = if cp_start + char_count > ccp_text && cp_start < ccp_text {
ccp_text - cp_start
} else if cp_start >= ccp_text {
continue;
} else {
char_count
};
if is_compressed {
// For compressed pieces the stored offset counts 16-bit slots, so
// halve it to get the byte offset of the 8-bit characters.
let byte_offset = (fc_raw & 0x3FFF_FFFF) as usize / 2;
let end = byte_offset + chars_to_read;
// Out-of-range pieces are silently skipped rather than erroring.
if end <= word_doc.len() {
let bytes = &word_doc[byte_offset..end];
for &b in bytes {
result.push(cp1252_to_char(b));
}
}
} else {
let byte_offset = (fc_raw & 0x3FFF_FFFF) as usize;
let end = byte_offset + chars_to_read * 2;
if end <= word_doc.len() {
let bytes = &word_doc[byte_offset..end];
for chunk in bytes.chunks_exact(2) {
let code_unit = u16::from_le_bytes([chunk[0], chunk[1]]);
// Invalid code units (e.g. lone surrogates) are dropped.
if let Some(c) = char::from_u32(code_unit as u32) {
result.push(c);
}
}
}
}
}
Ok(normalize_doc_text(&result))
}
/// Recovers text stored as one contiguous run in the WordDocument stream
/// (i.e. when no piece table is present).
///
/// The `fcMin`/`fcMac` FIB fields at 0x18/0x1C bound the text range; the
/// encoding (UTF-16LE vs CP-1252) is guessed from the byte layout. Whenever
/// the range looks unusable or decodes to nothing, the heuristic scanner is
/// used instead.
fn extract_text_contiguous(word_doc: &[u8], ccp_text: usize) -> Result<String> {
    if word_doc.len() < 0x20 {
        return extract_text_fallback(word_doc, ccp_text);
    }
    let u32_at = |off: usize| {
        u32::from_le_bytes([word_doc[off], word_doc[off + 1], word_doc[off + 2], word_doc[off + 3]]) as usize
    };
    let fc_min = u32_at(0x18);
    let fc_mac = u32_at(0x1C);
    if fc_min == 0 || fc_min >= word_doc.len() {
        return extract_text_fallback(word_doc, ccp_text);
    }
    // Clamp the declared range to what the stream actually contains.
    let span = fc_mac.saturating_sub(fc_min).min(word_doc.len() - fc_min);
    if span == 0 {
        return extract_text_fallback(word_doc, ccp_text);
    }
    let raw = &word_doc[fc_min..fc_min + span];
    // Heuristic: enough bytes for 2-byte characters, or a high NUL density
    // (high bytes of ASCII-range UTF-16LE units), suggests UTF-16.
    let nul_bytes = raw.iter().filter(|&&b| b == 0).count();
    let looks_utf16 = span >= ccp_text * 2 || nul_bytes > span / 4;
    let decoded = if looks_utf16 {
        let units: Vec<u16> = raw
            .chunks_exact(2)
            .take(ccp_text)
            .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    } else {
        raw.iter().take(ccp_text).map(|&b| cp1252_to_char(b)).collect()
    };
    let normalized = normalize_doc_text(&decoded);
    if normalized.is_empty() {
        extract_text_fallback(word_doc, ccp_text)
    } else {
        Ok(normalized)
    }
}
/// Last-resort heuristic scanner: collects runs of printable CP-1252 bytes
/// from the WordDocument stream (skipping the 256-byte header area) and joins
/// the runs with single spaces.
///
/// Only runs whose decoded form is at least 3 bytes long are kept, filtering
/// out incidental printable bytes inside binary structures.
fn extract_text_fallback(word_doc: &[u8], _ccp_text: usize) -> Result<String> {
    let mut pieces: Vec<String> = Vec::new();
    let mut run = String::new();
    // Keep a run only if it reached 3+ bytes; always reset the buffer.
    let flush = |run: &mut String, pieces: &mut Vec<String>| {
        if run.len() >= 3 {
            pieces.push(std::mem::take(run));
        } else {
            run.clear();
        }
    };
    for &byte in word_doc.iter().skip(256) {
        // CR, LF, TAB, or the printable CP-1252 range count as text.
        if matches!(byte, 0x09 | 0x0A | 0x0D | 0x20..=0xFE) {
            run.push(cp1252_to_char(byte));
        } else {
            flush(&mut run, &mut pieces);
        }
    }
    flush(&mut run, &mut pieces);
    if pieces.is_empty() {
        return Err(KreuzbergError::parsing("No text content found in DOC file"));
    }
    Ok(normalize_doc_text(&pieces.join(" ")))
}
/// Extracts text from a pre-Word 97 file, where the characters sit as a
/// single CP-1252 run inside the WordDocument stream itself.
fn extract_text_word6(word_doc: &[u8]) -> Result<String> {
    if word_doc.len() < 0x50 {
        return Err(KreuzbergError::parsing("Word 6/95 file too short"));
    }
    let u32_at = |off: usize| {
        u32::from_le_bytes([word_doc[off], word_doc[off + 1], word_doc[off + 2], word_doc[off + 3]]) as usize
    };
    // Old FIB layout: ccpText at 0x4C, fcMin (start of text) at 0x18.
    let ccp_text = u32_at(0x4C);
    let fc_min = u32_at(0x18);
    match word_doc.get(fc_min..fc_min + ccp_text) {
        Some(raw) => {
            let decoded: String = raw.iter().map(|&b| cp1252_to_char(b)).collect();
            Ok(normalize_doc_text(&decoded))
        }
        // Declared range exceeds the stream: scan heuristically instead.
        None => extract_text_fallback(word_doc, ccp_text),
    }
}
/// Cleans up raw extracted text: maps Word's control characters to normal
/// whitespace, strips the remaining C0 controls, collapses runs of three or
/// more newlines down to exactly two, and trims the result.
fn normalize_doc_text(text: &str) -> String {
    // Pass 1: translate or drop control characters.
    // \r, \x0B, \x0C -> newline; \x07 -> tab; \n and \t pass through;
    // every other character below 0x20 (including the field markers
    // \x13/\x14/\x15) is removed.
    let mapped: String = text
        .chars()
        .filter_map(|c| match c {
            '\r' | '\x0B' | '\x0C' => Some('\n'),
            '\x07' => Some('\t'),
            '\n' | '\t' => Some(c),
            c if c < '\x20' => None,
            c => Some(c),
        })
        .collect();
    // Pass 2: permit at most two consecutive newlines (one blank line).
    let mut cleaned = String::with_capacity(mapped.len());
    let mut newline_run = 0usize;
    for c in mapped.chars() {
        if c == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        cleaned.push(c);
    }
    cleaned.trim().to_string()
}
/// Maps a single Windows-1252 byte to its Unicode character.
///
/// Bytes outside 0x80..=0x9F coincide with Latin-1 and convert directly.
/// The 0x80..=0x9F block uses the CP-1252 typographic assignments; slots that
/// CP-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) pass through as
/// their C1 control code points, matching the direct `b as char` conversion.
fn cp1252_to_char(b: u8) -> char {
    const C1_BLOCK: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];
    if (0x80..=0x9F).contains(&b) {
        C1_BLOCK[(b - 0x80) as usize]
    } else {
        b as char
    }
}
/// Reads an entire named stream out of the OLE container.
///
/// # Errors
/// Returns a parsing error when the stream is missing or cannot be read.
fn read_stream(comp: &mut cfb::CompoundFile<Cursor<&[u8]>>, name: &str) -> Result<Vec<u8>> {
    use std::io::Read;
    let mut buf = Vec::new();
    comp.open_stream(name)
        .map_err(|e| KreuzbergError::parsing(format!("Failed to open stream '{name}': {e}")))?
        .read_to_end(&mut buf)
        .map_err(|e| KreuzbergError::parsing(format!("Failed to read stream '{name}': {e}")))?;
    Ok(buf)
}
fn extract_doc_metadata(comp: &mut cfb::CompoundFile<Cursor<&[u8]>>) -> DocMetadata {
let mut meta = DocMetadata::default();
if let Ok(data) = read_stream(comp, "/\x05SummaryInformation") {
parse_summary_info(&data, &mut meta);
}
if let Ok(data) = read_stream(comp, "/\x05DocumentSummaryInformation") {
parse_doc_summary_info(&data, &mut meta);
}
meta
}
/// Parses the `\x05SummaryInformation` stream header and hands its first
/// section to the property-set parser. Malformed or truncated data is
/// silently ignored (metadata is best-effort).
fn parse_summary_info(data: &[u8], meta: &mut DocMetadata) {
    // The 28-byte stream header ends with the section count at offset 24.
    let section_count = match data.get(24..28) {
        Some(b) => u32::from_le_bytes([b[0], b[1], b[2], b[3]]) as usize,
        None => return,
    };
    if section_count == 0 {
        return;
    }
    // First section descriptor: 16-byte FMTID at 28, section offset at 44.
    let first_section = match data.get(44..48) {
        Some(b) => u32::from_le_bytes([b[0], b[1], b[2], b[3]]) as usize,
        None => return,
    };
    parse_property_set(data, first_section, meta, false);
}
/// Parses the `\x05DocumentSummaryInformation` stream: reads the first
/// section's offset from the header (offset 44) and delegates to the
/// property-set parser. Truncated data is silently ignored.
fn parse_doc_summary_info(data: &[u8], meta: &mut DocMetadata) {
    let first_section = match data.get(44..48) {
        Some(b) => u32::from_le_bytes([b[0], b[1], b[2], b[3]]) as usize,
        None => return,
    };
    parse_property_set(data, first_section, meta, true);
}
/// Walks one property set and copies recognized string properties into `meta`.
///
/// The set begins with a size field (4 bytes, skipped) and a property count
/// (4 bytes), followed by `(id, offset)` pairs; each offset is relative to
/// the start of the set. Unknown ids and non-string values are skipped.
/// NOTE(review): FILETIME properties are never decoded here, so
/// `meta.created`/`meta.modified` are not set by this function.
fn parse_property_set(data: &[u8], set_offset: usize, meta: &mut DocMetadata, _is_doc_summary: bool) {
    let u32_at = |off: usize| {
        data.get(off..off + 4)
            .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
    };
    let num_props = match u32_at(set_offset + 4) {
        Some(n) => n as usize,
        None => return,
    };
    for i in 0..num_props {
        let entry = set_offset + 8 + i * 8;
        // Stop as soon as an entry would run past the buffer.
        let (prop_id, rel_offset) = match (u32_at(entry), u32_at(entry + 4)) {
            (Some(id), Some(off)) => (id, off as usize),
            _ => break,
        };
        let abs_offset = set_offset + rel_offset;
        // A bad offset invalidates only this property, not the whole set.
        if abs_offset + 8 > data.len() {
            continue;
        }
        if let Some(value) = read_property_value(data, abs_offset) {
            match prop_id {
                2 => meta.title = Some(value),
                3 => meta.subject = Some(value),
                4 => meta.author = Some(value),
                8 => meta.last_author = Some(value),
                9 => meta.revision_number = Some(value),
                _ => {}
            }
        }
    }
}
/// Decodes a single property value at `offset` within the property set data.
///
/// Supports variant type 30 (NUL-terminated 8-bit string) and 31
/// (NUL-terminated UTF-16LE string). Returns `None` for any other variant
/// type, a zero-length value, or a length that runs past the buffer.
fn read_property_value(data: &[u8], offset: usize) -> Option<String> {
    if offset + 8 > data.len() {
        return None;
    }
    let u32_at = |off: usize| u32::from_le_bytes([data[off], data[off + 1], data[off + 2], data[off + 3]]);
    let vt_type = u32_at(offset);
    let len = u32_at(offset + 4) as usize;
    if len == 0 {
        return None;
    }
    let payload = offset + 8;
    match vt_type {
        // 8-bit characters; the declared length includes the NUL terminator.
        30 => {
            let bytes = data.get(payload..payload + len)?;
            let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
            Some(String::from_utf8_lossy(&bytes[..end]).into_owned())
        }
        // UTF-16LE code units; stop at the first NUL unit.
        31 => {
            let bytes = data.get(payload..payload + len * 2)?;
            let units: Vec<u16> = bytes
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                .take_while(|&u| u != 0)
                .collect();
            Some(String::from_utf16_lossy(&units))
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cp1252_to_char_ascii() {
        // ASCII bytes map through unchanged.
        assert_eq!(cp1252_to_char(b'A'), 'A');
        assert_eq!(cp1252_to_char(b' '), ' ');
        assert_eq!(cp1252_to_char(b'\n'), '\n');
    }

    #[test]
    fn test_cp1252_to_char_special() {
        // The CP-1252 0x80..0x9F block maps to typographic characters.
        assert_eq!(cp1252_to_char(0x80), '\u{20AC}');
        assert_eq!(cp1252_to_char(0x93), '\u{201C}');
        assert_eq!(cp1252_to_char(0x94), '\u{201D}');
        assert_eq!(cp1252_to_char(0x96), '\u{2013}');
    }

    #[test]
    fn test_normalize_doc_text() {
        assert_eq!(normalize_doc_text("Hello\rWorld"), "Hello\nWorld");
        assert_eq!(normalize_doc_text("A\x07B"), "A\tB");
        assert_eq!(normalize_doc_text("A\x0BB"), "A\nB");
        assert_eq!(normalize_doc_text("A\n\n\n\nB"), "A\n\nB");
    }

    #[test]
    fn test_normalize_doc_text_field_codes() {
        // Field begin/separator/end markers are stripped entirely.
        assert_eq!(normalize_doc_text("A\x13FIELD\x14result\x15B"), "AFIELDresultB");
    }

    #[test]
    fn test_extract_doc_real_file() {
        let test_file = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("../../test_documents/vendored/unstructured/doc/simple.doc");
        // Vendored fixtures are optional; skip quietly when absent.
        if !test_file.exists() {
            return;
        }
        let content = std::fs::read(&test_file).expect("Failed to read test DOC");
        let result = extract_doc_text(&content).expect("Failed to extract DOC text");
        assert!(!result.text.is_empty(), "DOC extraction should produce text");
    }

    #[test]
    fn test_extract_doc_fake_file() {
        let test_file = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("../../test_documents/vendored/unstructured/doc/fake.doc");
        if !test_file.exists() {
            return;
        }
        let content = std::fs::read(&test_file).expect("Failed to read test DOC");
        let result = extract_doc_text(&content).expect("Failed to extract DOC text");
        assert!(!result.text.is_empty(), "DOC extraction should produce text");
    }

    #[test]
    fn test_extract_doc_invalid_magic() {
        // Arbitrary bytes are not a valid OLE container.
        let result = extract_doc_text(b"not a doc file");
        assert!(result.is_err());
    }
}