lo_writer 0.3.7

Writer-like document editing with Markdown and plain text import/export
Documentation
//! Plain-text extractor for the legacy binary `.doc` format.
//!
//! Built on top of [`lo_core::CfbFile`] for the Compound File Binary
//! container, then walks the WordDocument piece table (CLX/PlcPcd) to
//! decode each text piece (compressed CP1252 or uncompressed UTF-16LE).

use lo_core::{CfbFile, LoError, Result};

/// Extracts the plain text of a legacy binary `.doc` file.
///
/// `bytes` is the complete file. The function opens it as a compound file,
/// reads the `WordDocument` stream and one of the `0Table` / `1Table`
/// streams, locates the CLX through the offset/length pair stored at
/// `0x01A2` / `0x01A6` of `WordDocument`, and then decodes every piece listed
/// in the PlcPcd:
///
/// * pieces flagged as compressed are read one byte per character and mapped
///   through [`decode_cp1252`];
/// * all other pieces are read as UTF-16LE, with surrogate pairs combined
///   into a single character.
///
/// Afterwards `\r` and `\u{000B}` become `\n`, `\u{0007}` becomes a tab,
/// `\u{000C}` becomes an empty line, and runs of blank lines are collapsed.
///
/// # Errors
///
/// * [`LoError::Unsupported`] when bit 8 of the flag word at `0x0A` is set
///   (the file is reported as encrypted).
/// * [`LoError::Parse`] when the `WordDocument` stream is shorter than
///   `0x20` bytes, or when the CLX, the PlcPcd or any text piece points
///   outside the stream it lives in.
/// * Any error produced by [`CfbFile::open`] or by reading the streams.
pub fn extract_text_from_doc(bytes: &[u8]) -> Result<String> {
    let cfb = CfbFile::open(bytes)?;
    let word = cfb.read_stream("WordDocument")?;
    if word.len() < 0x20 {
        return Err(LoError::Parse(
            "DOC WordDocument stream is too small".to_string(),
        ));
    }

    let fib_flags = read_u16(&word, 0x0A)?;
    if (fib_flags & (1 << 8)) != 0 {
        return Err(LoError::Unsupported(
            "encrypted legacy DOC files are not supported by the pure-Rust parser".to_string(),
        ));
    }

    // Bit 9 names the table stream to use; if that stream is missing, fall
    // back to the other one instead of failing outright.
    let use_1table = (fib_flags & (1 << 9)) != 0;
    let table_stream_name = if use_1table { "1Table" } else { "0Table" };
    let table = cfb
        .read_stream(table_stream_name)
        .or_else(|_| cfb.read_stream(if use_1table { "0Table" } else { "1Table" }))?;

    // The CLX (which holds the piece table) lives in the table stream.
    let fc_clx = read_u32(&word, 0x01A2)? as usize;
    let lcb_clx = read_u32(&word, 0x01A6)? as usize;
    let clx_end = fc_clx
        .checked_add(lcb_clx)
        .ok_or_else(|| LoError::Parse("DOC CLX overflow".to_string()))?;
    if clx_end > table.len() {
        return Err(LoError::Parse(
            "DOC CLX extends past the table stream".to_string(),
        ));
    }
    let clx = &table[fc_clx..clx_end];
    let plcpcd = extract_plcpcd(clx)?;

    // PlcPcd layout: (piece_count + 1) 4-byte character positions followed by
    // piece_count 8-byte piece descriptors.
    let piece_count = plcpcd_piece_count(plcpcd.len())?;
    let cp_count = piece_count + 1;
    let mut cps = Vec::with_capacity(cp_count);
    for index in 0..cp_count {
        cps.push(read_u32(plcpcd, index * 4)? as usize);
    }

    let pcd_base = cp_count * 4;
    let mut out = String::new();
    for piece_index in 0..piece_count {
        // Non-monotonic character positions yield an empty piece rather than
        // an underflow.
        let char_count = cps[piece_index + 1].saturating_sub(cps[piece_index]);
        let pcd = plcpcd
            .get(pcd_base + piece_index * 8..pcd_base + (piece_index + 1) * 8)
            .ok_or_else(|| LoError::Parse("DOC PCD out of bounds".to_string()))?;
        // The 32-bit word at byte 2 of the descriptor packs the compression
        // flag (bit 30) together with the 30-bit file offset.
        let fc_compressed = read_u32(pcd, 2)?;
        let is_compressed = (fc_compressed & 0x4000_0000) != 0;
        let fc = (fc_compressed & 0x3FFF_FFFF) as usize;
        if is_compressed {
            // Compressed pieces store one byte per character at half the
            // recorded offset.
            let start = fc / 2;
            let end = start
                .checked_add(char_count)
                .ok_or_else(|| LoError::Parse("DOC ANSI piece overflow".to_string()))?;
            let data = word
                .get(start..end)
                .ok_or_else(|| LoError::Parse("DOC ANSI piece out of bounds".to_string()))?;
            for &byte in data {
                push_doc_char(&mut out, decode_cp1252(byte));
            }
        } else {
            let start = fc;
            let byte_len = char_count
                .checked_mul(2)
                .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
            let end = start
                .checked_add(byte_len)
                .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
            let data = word
                .get(start..end)
                .ok_or_else(|| LoError::Parse("DOC unicode piece out of bounds".to_string()))?;
            // Decode as real UTF-16 so that surrogate pairs (characters
            // outside the BMP) survive; converting each code unit on its own
            // would discard both halves. Unpaired surrogates are skipped.
            let units = data
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
            for ch in char::decode_utf16(units).filter_map(|decoded| decoded.ok()) {
                push_doc_char(&mut out, ch);
            }
        }
    }

    let normalized = out
        .replace('\r', "\n")
        .replace('\u{0007}', "\t")
        .replace('\u{000B}', "\n")
        .replace('\u{000C}', "\n\n");
    Ok(collapse_blank_lines(&normalized))
}

/// Returns the PlcPcd payload contained in a CLX blob.
///
/// The CLX is walked as a sequence of tagged entries:
///
/// * tag `0x01` is followed by a 16-bit length and that many bytes, all of
///   which are skipped;
/// * tag `0x02` is followed by a 32-bit length and the PlcPcd bytes, which
///   are returned as a sub-slice of `clx`.
///
/// # Errors
///
/// Returns [`LoError::Parse`] when an unknown tag byte is met, when a length
/// field or the PlcPcd itself runs past the end of `clx`, or when the CLX
/// ends without a `0x02` entry.
fn extract_plcpcd(clx: &[u8]) -> Result<&[u8]> {
    let mut offset = 0usize;
    while let Some(&tag) = clx.get(offset) {
        match tag {
            0x01 => {
                let cb = usize::from(read_u16(clx, offset + 1)?);
                offset += 3 + cb;
            }
            0x02 => {
                let lcb = read_u32(clx, offset + 1)? as usize;
                let start = offset + 5;
                // `lcb` comes straight from the file; on 32-bit targets
                // `start + lcb` can exceed `usize::MAX`, so add with a check
                // and report it as an ordinary out-of-bounds error.
                return start
                    .checked_add(lcb)
                    .and_then(|end| clx.get(start..end))
                    .ok_or_else(|| LoError::Parse("DOC PlcPcd out of bounds".to_string()));
            }
            other => {
                return Err(LoError::Parse(format!(
                    "unexpected DOC CLX byte {other:#x}"
                )));
            }
        }
    }
    Err(LoError::Parse(
        "DOC CLX does not contain a PlcPcd".to_string(),
    ))
}

/// Computes how many pieces a PlcPcd of `length` bytes describes.
///
/// A valid PlcPcd is 4 bytes plus a whole number of 12-byte units; the
/// number of units is the piece count. Any other length is rejected with
/// [`LoError::Parse`].
fn plcpcd_piece_count(length: usize) -> Result<usize> {
    match length.checked_sub(4) {
        Some(payload) if payload % 12 == 0 => Ok(payload / 12),
        _ => Err(LoError::Parse("DOC PlcPcd size is invalid".to_string())),
    }
}

/// Appends `ch` to `out` unless it is one of the control characters that are
/// dropped from the extracted text: `U+0000`, `U+0013`, `U+0014` or `U+0015`.
fn push_doc_char(out: &mut String, ch: char) {
    let dropped = matches!(ch, '\u{0000}' | '\u{0013}'..='\u{0015}');
    if dropped {
        return;
    }
    out.push(ch);
}

/// Squeezes every run of blank (empty or whitespace-only) lines in `text`
/// down to a single empty line, strips trailing whitespace from the remaining
/// lines, and removes leading and trailing newlines from the result.
fn collapse_blank_lines(text: &str) -> String {
    let mut collapsed = String::new();
    let mut previous_was_blank = false;
    for raw_line in text.lines() {
        let is_blank = raw_line.trim().is_empty();
        if !is_blank {
            collapsed.push_str(raw_line.trim_end());
            collapsed.push('\n');
        } else if !previous_was_blank {
            // Only the first line of a blank run contributes a separator.
            collapsed.push('\n');
        }
        previous_was_blank = is_blank;
    }
    collapsed.trim_matches('\n').to_string()
}

/// Reads a little-endian `u16` from `bytes` starting at `offset`.
///
/// Returns [`LoError::Parse`] when fewer than two bytes are available at
/// that position.
fn read_u16(bytes: &[u8], offset: usize) -> Result<u16> {
    match bytes.get(offset..offset + 2) {
        Some(&[lo, hi]) => Ok(u16::from_le_bytes([lo, hi])),
        _ => Err(LoError::Parse(format!(
            "DOC read_u16 out of bounds at {offset}"
        ))),
    }
}

/// Reads a little-endian `u32` from `bytes` starting at `offset`.
///
/// Returns [`LoError::Parse`] when fewer than four bytes are available at
/// that position.
fn read_u32(bytes: &[u8], offset: usize) -> Result<u32> {
    match bytes.get(offset..offset + 4) {
        Some(&[b0, b1, b2, b3]) => Ok(u32::from_le_bytes([b0, b1, b2, b3])),
        _ => Err(LoError::Parse(format!(
            "DOC read_u32 out of bounds at {offset}"
        ))),
    }
}

/// Maps a single Windows-1252 byte to its Unicode character.
///
/// Bytes in `0x80..=0x9F` are looked up in a table; the five positions that
/// have no dedicated mapping (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) and
/// every byte outside that range map to the code point with the same
/// numeric value.
fn decode_cp1252(byte: u8) -> char {
    // Entry `i` is the character for byte `0x80 + i`.
    const HIGH_BLOCK: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];

    if (0x80..=0x9F).contains(&byte) {
        HIGH_BLOCK[usize::from(byte - 0x80)]
    } else {
        char::from(byte)
    }
}