lo_writer 0.3.3

//! Pure-Rust reader for the legacy binary `.doc` format.
//!
//! Parses a Compound File Binary (CFB) container, locates the
//! `WordDocument` and table streams, and extracts the piece-table
//! text. This intentionally returns plain text (not styled runs)
//! because the binary format predates anything resembling DOCX's
//! structure; callers can re-flow it through `from_plain_text`.

use std::collections::HashSet;

use lo_core::{LoError, Result};

const CFB_SIGNATURE: &[u8; 8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";
const FREE_SECTOR: u32 = 0xFFFF_FFFF;
const END_OF_CHAIN: u32 = 0xFFFF_FFFE;
const FAT_SECTOR: u32 = 0xFFFF_FFFD;
const DIFAT_SECTOR: u32 = 0xFFFF_FFFC;

/// One in-use record taken from the CFB directory stream.
#[derive(Debug, Clone)]
struct DirectoryEntry {
    /// Entry name decoded from the record's UTF-16LE name field, without
    /// trailing NUL characters.
    name: String,
    /// Raw object-type byte; this parser looks streams up with value 2 and
    /// the root entry with value 5.
    object_type: u8,
    /// First sector of the entry's data chain.
    starting_sector: u32,
    /// Declared length of the entry's data, in bytes.
    stream_size: usize,
}

/// In-memory view of a parsed Compound File Binary container.
#[derive(Debug, Clone)]
struct CompoundFile {
    /// Private copy of the whole container file; regular sectors are sliced
    /// out of it on demand.
    bytes: Vec<u8>,
    /// Size of a regular sector in bytes (`1 << sector shift` from the header).
    sector_size: usize,
    /// Size of a mini sector in bytes (`1 << mini sector shift` from the header).
    mini_sector_size: usize,
    /// Streams strictly smaller than this many bytes are read from the mini stream.
    mini_stream_cutoff: usize,
    /// FAT: for each regular sector, the id of the next sector in its chain.
    fat: Vec<u32>,
    /// Mini FAT: for each mini sector, the id of the next mini sector in its chain.
    mini_fat: Vec<u32>,
    /// Every directory record whose object type is non-zero.
    directory: Vec<DirectoryEntry>,
    /// Contents of the root entry's chain, which backs all mini sectors.
    mini_stream: Vec<u8>,
}

/// One entry of the piece table: a run of character positions and where its
/// text is stored inside the `WordDocument` stream.
#[derive(Debug, Clone)]
struct Piece {
    /// First character position covered by the piece.
    cp_start: usize,
    /// Character position one past the end of the piece.
    cp_end: usize,
    /// Stored file-offset field with its two top bits masked off.
    fc: u32,
    /// Bit 30 of the stored offset; when set the text is one byte per
    /// character and begins at `fc / 2`, otherwise it is UTF-16LE at `fc`.
    compressed: bool,
}

/// Extracts the plain text of a legacy binary `.doc` file.
///
/// Opens `bytes` as a CFB container, reads the `WordDocument` stream and the
/// table stream selected by the flags word at offset `0x0A`, locates and
/// parses the piece table, decodes every piece and normalises the result.
///
/// # Errors
///
/// Returns `LoError::Unsupported` when the encryption flag is set, and
/// propagates any error from container parsing, stream lookup, or piece-table
/// decoding.
pub fn extract_text_from_doc(bytes: &[u8]) -> Result<String> {
    /// Bit of the flags word that marks the document as encrypted.
    const ENCRYPTED_FLAG: u16 = 1 << 8;
    /// Bit of the flags word that selects `1Table` over `0Table`.
    const USE_1TABLE_FLAG: u16 = 1 << 9;

    let container = CompoundFile::open(bytes)?;
    let main_stream = container.read_stream("WordDocument")?;
    if main_stream.len() < 0x20 {
        return Err(LoError::Parse(
            "DOC WordDocument stream is too small".to_string(),
        ));
    }

    let flags = read_u16(&main_stream, 0x0A)?;
    if flags & ENCRYPTED_FLAG != 0 {
        return Err(LoError::Unsupported(
            "encrypted legacy DOC files are not supported by the pure-Rust parser".to_string(),
        ));
    }
    let table_name = if flags & USE_1TABLE_FLAG != 0 {
        "1Table"
    } else {
        "0Table"
    };
    let table = container.read_stream(table_name)?;

    let (fc_clx, lcb_clx) = locate_clx(&main_stream, &table)?;
    let clx_end = fc_clx
        .checked_add(lcb_clx)
        .ok_or_else(|| LoError::Parse("DOC CLX overflow".to_string()))?;
    // `clx_end >= fc_clx`, so the only way `get` can fail is running past the table.
    let clx = table.get(fc_clx..clx_end).ok_or_else(|| {
        LoError::Parse("DOC CLX extends past the table stream".to_string())
    })?;

    let pieces = parse_clx(clx)?;
    extract_piece_text(&main_stream, &pieces).map(|raw| normalize_doc_text(&raw))
}

impl CompoundFile {
    fn open(bytes: &[u8]) -> Result<Self> {
        if bytes.len() < 512 {
            return Err(LoError::Parse(
                "CFB file is smaller than its 512-byte header".to_string(),
            ));
        }
        if &bytes[..8] != CFB_SIGNATURE {
            return Err(LoError::Parse(
                "not a Compound File Binary document".to_string(),
            ));
        }
        let major_version = read_u16(bytes, 0x1A)?;
        let sector_shift = read_u16(bytes, 0x1E)?;
        let mini_sector_shift = read_u16(bytes, 0x20)?;
        let sector_size = 1usize
            .checked_shl(sector_shift as u32)
            .ok_or_else(|| LoError::Parse("invalid CFB sector shift".to_string()))?;
        let mini_sector_size = 1usize
            .checked_shl(mini_sector_shift as u32)
            .ok_or_else(|| LoError::Parse("invalid CFB mini sector shift".to_string()))?;
        let num_fat_sectors = read_u32(bytes, 0x2C)? as usize;
        let first_dir_sector = read_u32(bytes, 0x30)?;
        let mini_stream_cutoff = read_u32(bytes, 0x38)? as usize;
        let first_mini_fat_sector = read_u32(bytes, 0x3C)?;
        let num_mini_fat_sectors = read_u32(bytes, 0x40)? as usize;
        let first_difat_sector = read_u32(bytes, 0x44)?;
        let num_difat_sectors = read_u32(bytes, 0x48)? as usize;
        let mut difat = Vec::new();
        for index in 0..109usize {
            let sector = read_u32(bytes, 0x4C + index * 4)?;
            if sector != FREE_SECTOR {
                difat.push(sector);
            }
        }
        let entries_per_difat_sector = sector_size / 4 - 1;
        let mut next_difat = first_difat_sector;
        for _ in 0..num_difat_sectors {
            if next_difat == END_OF_CHAIN || next_difat == FREE_SECTOR {
                break;
            }
            let sector = read_sector(bytes, sector_size, next_difat)?;
            for index in 0..entries_per_difat_sector {
                let sector_id = read_u32(sector, index * 4)?;
                if sector_id != FREE_SECTOR {
                    difat.push(sector_id);
                }
            }
            next_difat = read_u32(sector, sector_size - 4)?;
        }
        if difat.len() < num_fat_sectors {
            return Err(LoError::Parse(format!(
                "CFB FAT list too short: expected {num_fat_sectors}, found {}",
                difat.len()
            )));
        }
        let mut fat = Vec::new();
        for &fat_sector in difat.iter().take(num_fat_sectors) {
            let sector = read_sector(bytes, sector_size, fat_sector)?;
            for chunk in sector.chunks_exact(4) {
                fat.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
            }
        }
        let directory_bytes = read_chain(bytes, sector_size, &fat, first_dir_sector, None)?;
        let mut directory = Vec::new();
        for entry in directory_bytes.chunks_exact(128) {
            let name_len = read_u16(entry, 0x40)? as usize;
            let object_type = entry[0x42];
            if object_type == 0 {
                continue;
            }
            let raw_name = if (2..=64).contains(&name_len) {
                &entry[..name_len - 2]
            } else {
                &entry[..0]
            };
            let mut utf16 = Vec::new();
            for chunk in raw_name.chunks_exact(2) {
                utf16.push(u16::from_le_bytes([chunk[0], chunk[1]]));
            }
            let name = String::from_utf16(&utf16)
                .unwrap_or_else(|_| String::from_utf8_lossy(raw_name).to_string())
                .trim_end_matches('\0')
                .to_string();
            let starting_sector = read_u32(entry, 0x74)?;
            let stream_size = if major_version == 3 {
                read_u32(entry, 0x78)? as usize
            } else {
                read_u64(entry, 0x78)? as usize
            };
            directory.push(DirectoryEntry {
                name,
                object_type,
                starting_sector,
                stream_size,
            });
        }
        let root = directory
            .iter()
            .find(|entry| entry.object_type == 5)
            .ok_or_else(|| LoError::InvalidInput("CFB Root Entry not found".to_string()))?
            .clone();
        let mini_fat_bytes = if num_mini_fat_sectors > 0 {
            read_chain(bytes, sector_size, &fat, first_mini_fat_sector, None)?
        } else {
            Vec::new()
        };
        let mut mini_fat = Vec::new();
        for chunk in mini_fat_bytes.chunks_exact(4) {
            mini_fat.push(u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
        }
        let mini_stream = if root.stream_size > 0 && root.starting_sector != END_OF_CHAIN {
            read_chain(
                bytes,
                sector_size,
                &fat,
                root.starting_sector,
                Some(root.stream_size),
            )?
        } else {
            Vec::new()
        };
        Ok(Self {
            bytes: bytes.to_vec(),
            sector_size,
            mini_sector_size,
            mini_stream_cutoff,
            fat,
            mini_fat,
            directory,
            mini_stream,
        })
    }

    fn read_stream(&self, name: &str) -> Result<Vec<u8>> {
        let entry = self
            .directory
            .iter()
            .find(|entry| entry.object_type == 2 && entry.name.eq_ignore_ascii_case(name))
            .ok_or_else(|| LoError::InvalidInput(format!("CFB stream not found: {name}")))?;
        if entry.stream_size < self.mini_stream_cutoff {
            self.read_mini_stream(entry.starting_sector, entry.stream_size)
        } else {
            read_chain(
                &self.bytes,
                self.sector_size,
                &self.fat,
                entry.starting_sector,
                Some(entry.stream_size),
            )
        }
    }

    fn read_mini_stream(&self, start_sector: u32, size: usize) -> Result<Vec<u8>> {
        if size == 0 {
            return Ok(Vec::new());
        }
        if self.mini_stream.is_empty() {
            return Err(LoError::Parse("CFB mini stream is empty".to_string()));
        }
        let mut sector = start_sector;
        let mut out = Vec::new();
        let mut seen = HashSet::new();
        while sector != END_OF_CHAIN && sector != FREE_SECTOR {
            if !seen.insert(sector) {
                return Err(LoError::Parse("CFB mini FAT cycle detected".to_string()));
            }
            let offset = sector as usize * self.mini_sector_size;
            let end = offset + self.mini_sector_size;
            if end > self.mini_stream.len() {
                return Err(LoError::Parse("CFB mini sector out of bounds".to_string()));
            }
            out.extend_from_slice(&self.mini_stream[offset..end]);
            if out.len() >= size {
                out.truncate(size);
                break;
            }
            sector = *self
                .mini_fat
                .get(sector as usize)
                .ok_or_else(|| LoError::Parse("CFB mini FAT reference out of range".to_string()))?;
        }
        Ok(out)
    }
}

/// Finds the `(offset, length)` of the CLX inside the table stream.
///
/// The pair stored at `0x1A2`/`0x1A6` of `word_document` is tried first.
/// If it does not pass [`looks_like_clx`], every even offset from `0x80` up to
/// (but excluding) `min(len, 0x400) - 8` is probed for a plausible pair.
///
/// # Errors
///
/// Returns `LoError::Parse` when no candidate pair describes a plausible CLX.
fn locate_clx(word_document: &[u8], table: &[u8]) -> Result<(usize, usize)> {
    // Reads an (fc, lcb) pair at `offset` and keeps it only if it is plausible.
    let candidate_at = |offset: usize| -> Result<Option<(usize, usize)>> {
        let fc = read_u32(word_document, offset)? as usize;
        let lcb = read_u32(word_document, offset + 4)? as usize;
        Ok(if looks_like_clx(table, fc, lcb) {
            Some((fc, lcb))
        } else {
            None
        })
    };

    if word_document.len() >= 0x1AA {
        if let Some(found) = candidate_at(0x1A2)? {
            return Ok(found);
        }
    }

    let scan_end = word_document.len().min(0x400).saturating_sub(8);
    let mut offset = 0x80usize;
    while offset < scan_end {
        if let Some(found) = candidate_at(offset)? {
            return Ok(found);
        }
        offset += 2;
    }

    Err(LoError::Parse(
        "failed to locate the DOC piece table (CLX)".to_string(),
    ))
}

/// Cheap plausibility check for a CLX candidate at `table[fc..fc + lcb]`.
///
/// The candidate is accepted when it is at least 5 bytes long, lies inside
/// `table`, and consists of zero or more `0x01` records (tag, little-endian
/// u16 size, payload) followed by a `0x02` record with at least 5 bytes
/// available from its tag onward. The length stored in the `0x02` record is
/// not inspected.
fn looks_like_clx(table: &[u8], fc: usize, lcb: usize) -> bool {
    if lcb < 5 {
        return false;
    }
    let clx = match fc.checked_add(lcb).and_then(|end| table.get(fc..end)) {
        Some(slice) => slice,
        None => return false,
    };

    let mut cursor = 0usize;
    loop {
        match clx.get(cursor).copied() {
            Some(0x01) => {
                // Skip the record: 1 tag byte + 2 size bytes + payload.
                let payload = match clx.get(cursor + 1..cursor + 3) {
                    Some(&[lo, hi]) => usize::from(u16::from_le_bytes([lo, hi])),
                    _ => return false,
                };
                cursor += 3 + payload;
            }
            Some(0x02) => return cursor + 5 <= clx.len(),
            // Unknown tag, or the records ran off the end without a 0x02.
            _ => return false,
        }
    }
}

/// Walks the records of a CLX and parses the piece table held by its first
/// `0x02` record.
///
/// `0x01` records are skipped using their little-endian u16 size. The `0x02`
/// record carries a u32 length followed by that many bytes, which are handed
/// to [`parse_plcpcd`].
///
/// # Errors
///
/// Returns `LoError::Parse` for a truncated `0x01` record, an unknown tag, a
/// piece table that exceeds the CLX, or a CLX without any `0x02` record.
fn parse_clx(clx: &[u8]) -> Result<Vec<Piece>> {
    let mut cursor = 0usize;
    while let Some(&tag) = clx.get(cursor) {
        if tag == 0x01 {
            if cursor + 3 > clx.len() {
                return Err(LoError::Parse("DOC CLX has a truncated RgPrc".to_string()));
            }
            let payload = usize::from(u16::from_le_bytes([clx[cursor + 1], clx[cursor + 2]]));
            cursor += 3 + payload;
            continue;
        }
        if tag != 0x02 {
            return Err(LoError::Parse(format!(
                "unexpected DOC CLX tag 0x{tag:02X}"
            )));
        }

        let lcb = read_u32(clx, cursor + 1)? as usize;
        let start = cursor + 5;
        let end = start
            .checked_add(lcb)
            .ok_or_else(|| LoError::Parse("DOC PlcPcd overflow".to_string()))?;
        if end > clx.len() {
            return Err(LoError::Parse("DOC PlcPcd exceeds CLX bounds".to_string()));
        }
        return parse_plcpcd(&clx[start..end]);
    }
    Err(LoError::Parse(
        "DOC CLX did not contain a piece table".to_string(),
    ))
}

/// Decodes a piece table: `n + 1` u32 character positions followed by `n`
/// eight-byte descriptors, for a total of `12 * n + 4` bytes.
///
/// Piece `i` spans character positions `cp[i]..cp[i + 1]`. Its descriptor
/// holds a u32 at byte 2 whose low 30 bits become `Piece::fc` and whose
/// bit 30 becomes `Piece::compressed`.
///
/// # Errors
///
/// Returns `LoError::Parse` when the byte length is not of the form
/// `12 * n + 4`.
fn parse_plcpcd(bytes: &[u8]) -> Result<Vec<Piece>> {
    let piece_count = match bytes.len().checked_sub(4) {
        Some(payload) if payload % 12 == 0 => payload / 12,
        _ => return Err(LoError::Parse("DOC PlcPcd has an invalid size".to_string())),
    };
    // Descriptors start right after the `piece_count + 1` character positions.
    let descriptors_at = (piece_count + 1) * 4;

    (0..piece_count)
        .map(|i| {
            let cp_start = read_u32(bytes, i * 4)? as usize;
            let cp_end = read_u32(bytes, (i + 1) * 4)? as usize;
            let raw_fc = read_u32(bytes, descriptors_at + i * 8 + 2)?;
            Ok(Piece {
                cp_start,
                cp_end,
                fc: raw_fc & 0x3FFF_FFFF,
                compressed: (raw_fc & 0x4000_0000) != 0,
            })
        })
        .collect()
}

/// Concatenates the decoded text of every non-empty piece, in table order.
///
/// A compressed piece occupies one byte per character starting at `fc / 2`;
/// any other piece occupies two bytes per character (UTF-16LE) starting at
/// `fc`. Pieces whose end position is not greater than their start are
/// skipped.
///
/// # Errors
///
/// Returns `LoError::Parse` when a piece's byte range overflows or reaches
/// past the end of `word_document`.
fn extract_piece_text(word_document: &[u8], pieces: &[Piece]) -> Result<String> {
    let mut text = String::new();
    for piece in pieces.iter().filter(|p| p.cp_end > p.cp_start) {
        let char_count = piece.cp_end - piece.cp_start;
        if piece.compressed {
            let start = (piece.fc / 2) as usize;
            let end = start
                .checked_add(char_count)
                .ok_or_else(|| LoError::Parse("DOC compressed piece overflow".to_string()))?;
            // `end >= start`, so `get` only fails when the range leaves the stream.
            let span = word_document.get(start..end).ok_or_else(|| {
                LoError::Parse("DOC compressed piece exceeds WordDocument bounds".to_string())
            })?;
            text.push_str(&decode_compressed_text(span));
        } else {
            let start = piece.fc as usize;
            let end = char_count
                .checked_mul(2)
                .and_then(|byte_len| start.checked_add(byte_len))
                .ok_or_else(|| LoError::Parse("DOC unicode piece overflow".to_string()))?;
            let span = word_document.get(start..end).ok_or_else(|| {
                LoError::Parse("DOC unicode piece exceeds WordDocument bounds".to_string())
            })?;
            text.push_str(&decode_utf16le(span));
        }
    }
    Ok(text)
}

/// Decodes `bytes` as UTF-16LE, substituting U+FFFD for unpaired surrogates.
///
/// A trailing odd byte is ignored.
fn decode_utf16le(bytes: &[u8]) -> String {
    let units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
    char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect()
}

/// Decodes one-byte-per-character piece text.
///
/// Bytes outside `0x80..=0x9F` map to the code point with the same value.
/// Inside that range a fixed table substitutes typographic characters (euro
/// sign, curly quotes, dashes, ...); the five slots without a substitute
/// (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) keep their own value.
fn decode_compressed_text(bytes: &[u8]) -> String {
    /// Replacement for each byte in `0x80..=0x9F`, indexed by `byte - 0x80`.
    const HIGH_REMAP: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];

    bytes
        .iter()
        .map(|&byte| match byte {
            0x80..=0x9F => HIGH_REMAP[usize::from(byte - 0x80)],
            _ => char::from(byte),
        })
        .collect()
}

/// Turns raw piece text into tidy plain text.
///
/// First pass, per character:
/// * `\r` becomes a blank-line break (`"\n\n"`, never doubled);
/// * `\n`, U+000B and U+000C become a single `\n` (never doubled);
/// * U+0007 becomes a tab (never doubled);
/// * every other control character except tab is dropped — this includes
///   U+0000 and U+0013..U+0015;
/// * whitespace (tab included) collapses to a single space;
/// * anything else is copied through.
///
/// Second pass: each line is trimmed, empty lines are discarded, and the
/// remaining lines are joined with one blank line between them.
fn normalize_doc_text(raw: &str) -> String {
    let mut flattened = String::new();
    let mut space_pending = false;
    for ch in raw.chars() {
        match ch {
            '\r' => {
                if !flattened.ends_with("\n\n") {
                    flattened.push_str("\n\n");
                }
                space_pending = false;
            }
            '\n' | '\x0B' | '\x0C' => {
                if !flattened.ends_with('\n') {
                    flattened.push('\n');
                }
                space_pending = false;
            }
            '\x07' => {
                if !flattened.ends_with('\t') {
                    flattened.push('\t');
                }
                space_pending = false;
            }
            // Dropped silently; deliberately leaves `space_pending` untouched so
            // spaces on either side of a dropped control still collapse to one.
            c if c != '\t' && c.is_control() => {}
            c if c.is_whitespace() => {
                if !space_pending {
                    flattened.push(' ');
                    space_pending = true;
                }
            }
            c => {
                flattened.push(c);
                space_pending = false;
            }
        }
    }

    flattened
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n\n")
}

/// Concatenates the sectors of a FAT chain that starts at `start_sector`.
///
/// The walk ends at an end-of-chain or free marker. When `size` is
/// `Some(limit)` it also ends as soon as `limit` bytes are available, and the
/// result is truncated to exactly `limit`. A chain that ends early yields
/// fewer bytes without an error.
///
/// # Errors
///
/// Returns `LoError::Parse` when the chain reaches a FAT/DIFAT marker,
/// revisits a sector, references a sector outside the file, or follows a
/// link outside `fat`.
fn read_chain(
    bytes: &[u8],
    sector_size: usize,
    fat: &[u32],
    start_sector: u32,
    size: Option<usize>,
) -> Result<Vec<u8>> {
    let mut collected = Vec::new();
    let mut visited = HashSet::new();
    let mut current = start_sector;
    loop {
        if current == END_OF_CHAIN || current == FREE_SECTOR {
            break;
        }
        if current == FAT_SECTOR || current == DIFAT_SECTOR {
            return Err(LoError::Parse(
                "CFB stream chain points to a reserved sector".to_string(),
            ));
        }
        if !visited.insert(current) {
            return Err(LoError::Parse("CFB FAT cycle detected".to_string()));
        }
        collected.extend_from_slice(read_sector(bytes, sector_size, current)?);
        match size {
            Some(limit) if collected.len() >= limit => {
                collected.truncate(limit);
                break;
            }
            _ => {}
        }
        current = match fat.get(current as usize) {
            Some(&next) => next,
            None => return Err(LoError::Parse("CFB FAT reference out of range".to_string())),
        };
    }
    Ok(collected)
}

/// Returns the bytes of regular sector `sector_id`.
///
/// Sector `n` occupies `(n + 1) * sector_size .. (n + 2) * sector_size` of the
/// file, i.e. sector 0 starts one sector size into `bytes`.
///
/// # Errors
///
/// Returns `LoError::Parse` when the offset computation overflows or the
/// sector reaches past the end of `bytes`.
fn read_sector(bytes: &[u8], sector_size: usize, sector_id: u32) -> Result<&[u8]> {
    // `sector_id + 1` is checked as well: where `usize` is 32 bits wide,
    // id 0xFFFF_FFFF would otherwise wrap to 0 and alias the file header.
    let offset = (sector_id as usize)
        .checked_add(1)
        .and_then(|index| index.checked_mul(sector_size))
        .ok_or_else(|| LoError::Parse("CFB sector offset overflow".to_string()))?;
    let end = offset
        .checked_add(sector_size)
        .ok_or_else(|| LoError::Parse("CFB sector end overflow".to_string()))?;
    // `end >= offset`, so `get` fails only when the sector leaves the file.
    bytes.get(offset..end).ok_or_else(|| {
        LoError::Parse("CFB sector extends past the file end".to_string())
    })
}

/// Reads a little-endian `u16` at `offset`, or fails with `LoError::Parse`
/// when fewer than two bytes are available there.
fn read_u16(bytes: &[u8], offset: usize) -> Result<u16> {
    match bytes.get(offset..offset + 2) {
        Some(&[b0, b1]) => Ok(u16::from_le_bytes([b0, b1])),
        _ => Err(LoError::Parse(
            "unexpected EOF while reading u16".to_string(),
        )),
    }
}
/// Reads a little-endian `u32` at `offset`, or fails with `LoError::Parse`
/// when fewer than four bytes are available there.
fn read_u32(bytes: &[u8], offset: usize) -> Result<u32> {
    match bytes.get(offset..offset + 4) {
        Some(&[b0, b1, b2, b3]) => Ok(u32::from_le_bytes([b0, b1, b2, b3])),
        _ => Err(LoError::Parse(
            "unexpected EOF while reading u32".to_string(),
        )),
    }
}
/// Reads a little-endian `u64` at `offset`, or fails with `LoError::Parse`
/// when fewer than eight bytes are available there.
fn read_u64(bytes: &[u8], offset: usize) -> Result<u64> {
    let raw = match bytes.get(offset..offset + 8) {
        Some(raw) => raw,
        None => {
            return Err(LoError::Parse(
                "unexpected EOF while reading u64".to_string(),
            ))
        }
    };
    let mut le = [0u8; 8];
    le.copy_from_slice(raw);
    Ok(u64::from_le_bytes(le))
}