use super::super::package::{DocError, Result};
use super::fib::FileInformationBlock;
use crate::ole::binary::{read_u16_le, read_u32_le};
/// Size in bytes of one PCD (piece descriptor) entry inside the piece table.
pub const PIECE_DESCRIPTOR_SIZE: usize = 8;
/// Extracts the full document text from a Word binary (.doc) file.
///
/// The text is decoded once at construction (from the piece table when
/// present, otherwise via a raw fallback scan) and cached here.
pub struct TextExtractor {
// Decoded document text, populated by `extract_text_from_pieces`.
text: String,
}
impl TextExtractor {
/// Builds an extractor by decoding the document text immediately.
///
/// `word_document` is the raw WordDocument stream; `table_stream` is the
/// 0Table/1Table stream the FIB points into. Returns `DocError::Corrupted`
/// if the piece-table structures are inconsistent.
pub fn new(
    fib: &FileInformationBlock,
    word_document: &[u8],
    table_stream: &[u8],
) -> Result<Self> {
    Ok(Self {
        text: Self::extract_text_from_pieces(fib, word_document, table_stream)?,
    })
}
/// Returns a copy of the cached document text.
///
/// Infallible today; the `Result` return is kept for API stability.
pub fn extract_all_text(&self) -> Result<String> {
    Ok(self.text.to_owned())
}
/// Locates the CLX in the table stream via the FIB and decodes the text it
/// describes, falling back to a raw scan when no usable piece table exists.
///
/// # Errors
/// Returns `DocError::Corrupted` when the FIB lacks a CLX pointer or the
/// pointed-to range does not fit inside `table_stream`.
fn extract_text_from_pieces(
    fib: &FileInformationBlock,
    word_document: &[u8],
    table_stream: &[u8],
) -> Result<String> {
    // Table pointer 33 is fcClx/lcbClx: the CLX location in the table stream.
    let (clx_offset, clx_length) = fib
        .get_table_pointer(33)
        .ok_or_else(|| DocError::Corrupted("CLX pointer not found in FIB".to_string()))?;
    // No CLX at all: simple documents store text at a fixed offset instead.
    if clx_length == 0 {
        return Self::extract_text_simple(word_document);
    }
    let clx_offset = clx_offset as usize;
    let clx_length = clx_length as usize;
    if clx_offset >= table_stream.len() {
        return Err(DocError::Corrupted(format!(
            "CLX offset {} is beyond table stream length {}",
            clx_offset,
            table_stream.len()
        )));
    }
    // checked_add: offset + length could wrap (notably on 32-bit targets)
    // with a hostile FIB; treat wrap-around like running off the end.
    let clx_end = match clx_offset.checked_add(clx_length) {
        Some(end) if end <= table_stream.len() => end,
        _ => {
            return Err(DocError::Corrupted(format!(
                "CLX extends beyond table stream: offset={}, length={}, stream_len={}",
                clx_offset,
                clx_length,
                table_stream.len()
            )))
        }
    };
    let clx_data = &table_stream[clx_offset..clx_end];
    // If the piece table is unusable or yields nothing, degrade to the raw
    // scan rather than failing the whole extraction.
    match Self::parse_piece_table(clx_data, word_document) {
        Ok(text) if !text.is_empty() => Ok(text),
        _ => Self::extract_text_simple(word_document),
    }
}
/// Walks the CLX record sequence looking for the Pcdt record (tag 0x02),
/// which wraps the piece table (PlcPcd), and decodes the text it maps.
///
/// Returns an empty string when the CLX contains no Pcdt record.
///
/// # Errors
/// Returns `DocError::Corrupted` when a record's size field or payload is
/// truncated, or an unknown record tag appears at the end of the data.
fn parse_piece_table(clx_data: &[u8], word_document: &[u8]) -> Result<String> {
    let mut offset = 0;
    while offset < clx_data.len() {
        let section_type = clx_data[offset];
        offset += 1;
        match section_type {
            // Prc (property modifier): u16 payload size, then payload — skip.
            0x01 => {
                if offset + 2 > clx_data.len() {
                    return Err(DocError::Corrupted("GRPPRL section truncated".to_string()));
                }
                let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                offset += 2 + size;
            }
            // Pcdt: u32 payload size, then the PlcPcd piece table itself.
            0x02 => {
                if offset + 4 > clx_data.len() {
                    return Err(DocError::Corrupted(
                        "Piece table size field truncated".to_string(),
                    ));
                }
                let piece_table_size = read_u32_le(clx_data, offset).unwrap_or(0) as usize;
                offset += 4;
                // checked_add: a bogus u32 size must not wrap the bounds test.
                let end = offset
                    .checked_add(piece_table_size)
                    .filter(|&end| end <= clx_data.len())
                    .ok_or_else(|| {
                        DocError::Corrupted("Piece table data truncated".to_string())
                    })?;
                let pieces = Self::parse_plex_of_cps(&clx_data[offset..end])?;
                return Self::extract_text_from_piece_descriptors(&pieces, word_document);
            }
            // Document-properties record: u16 size, then payload — skip.
            0x14 => {
                if offset + 2 > clx_data.len() {
                    return Err(DocError::Corrupted(
                        "Document Properties section truncated".to_string(),
                    ));
                }
                let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                offset += 2 + size;
            }
            // Some producers emit extra records; assume a leading u16 size
            // and skip, but fail if even the size field is missing.
            _ => {
                if offset + 2 <= clx_data.len() {
                    let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                    offset += 2 + size;
                } else {
                    return Err(DocError::Corrupted(format!(
                        "Unexpected CLX section type 0x{:02X} at end of data", section_type
                    )));
                }
            }
        }
    }
    // Walked the whole CLX without finding a Pcdt record: no piece table.
    Ok(String::new())
}
/// Parses a PlcPcd into piece descriptors.
///
/// A PlcPcd holding `n` pieces is laid out as `n + 1` u32 character
/// positions (CPs) followed by `n` 8-byte piece descriptors (PCDs):
///   len = 4 * (n + 1) + 8 * n  =>  n = (len - 4) / 12
/// Truncating division means trailing bytes that do not complete a piece
/// are ignored. Because `n` is derived from `len`, every index below is in
/// bounds by construction (12n + 4 <= len), so no truncation error can
/// occur; the `Result` return is kept for interface stability.
fn parse_plex_of_cps(plex_data: &[u8]) -> Result<Vec<PieceDescriptor>> {
    if plex_data.len() < 4 {
        return Ok(Vec::new());
    }
    let num_pieces = (plex_data.len() - 4) / (4 + PIECE_DESCRIPTOR_SIZE);
    if num_pieces == 0 {
        return Ok(Vec::new());
    }
    // CP array: piece i spans cps[i]..cps[i + 1].
    let mut cps = Vec::with_capacity(num_pieces + 1);
    for i in 0..=num_pieces {
        cps.push(read_u32_le(plex_data, i * 4).unwrap_or(0));
    }
    let struct_offset = (num_pieces + 1) * 4;
    let mut pieces = Vec::with_capacity(num_pieces);
    for i in 0..num_pieces {
        let offset = struct_offset + i * PIECE_DESCRIPTOR_SIZE;
        let piece_data = &plex_data[offset..offset + PIECE_DESCRIPTOR_SIZE];
        // PCD layout: u16 flags, u32 fc (file offset + encoding flag), u16 prm.
        let _flags = read_u16_le(piece_data, 0).unwrap_or(0);
        let mut fc = read_u32_le(piece_data, 2).unwrap_or(0);
        let _prm = read_u16_le(piece_data, 6).unwrap_or(0);
        // fc bit 30 set means the piece is 8-bit CP1252 text and the stored
        // offset is doubled (MS-DOC PCD fcCompressed); clear the flag and
        // halve to recover the real byte offset. Otherwise it is UTF-16LE.
        let is_ansi = (fc & 0x4000_0000) != 0;
        if is_ansi {
            fc &= !0x4000_0000;
            fc /= 2;
        }
        pieces.push(PieceDescriptor {
            cp_start: cps[i],
            cp_end: cps[i + 1],
            file_pos: fc as usize,
            is_ansi,
        });
    }
    Ok(pieces)
}
/// Decodes the text of every piece, concatenated in CP order.
///
/// Pieces that fall outside `word_document` are clipped or skipped with a
/// warning rather than failing the whole document. ANSI pieces are decoded
/// as Windows-1252; the rest as UTF-16LE (surrogate pairs included).
fn extract_text_from_piece_descriptors(
    pieces: &[PieceDescriptor],
    word_document: &[u8],
) -> Result<String> {
    let mut text = String::new();
    for piece in pieces {
        // saturating_sub: a corrupted piece with cp_end < cp_start must not
        // underflow (debug panic / huge count in release); treat as empty.
        let char_count = piece.cp_end.saturating_sub(piece.cp_start) as usize;
        if char_count == 0 {
            continue;
        }
        // ANSI pieces store 1 byte per character, UTF-16 pieces 2 bytes.
        let byte_count = if piece.is_ansi {
            char_count
        } else {
            char_count * 2
        };
        let start = piece.file_pos;
        if start >= word_document.len() {
            eprintln!("Warning: Piece file position {} beyond document length {}", start, word_document.len());
            continue;
        }
        // saturating_add: clamp instead of overflowing; the range is then
        // clipped to the document like any other over-long piece.
        let end = start.saturating_add(byte_count);
        if end > word_document.len() {
            eprintln!("Warning: Piece extends beyond document: start={}, end={}, doc_len={}", start, end, word_document.len());
        }
        let actual_end = end.min(word_document.len());
        let text_data = &word_document[start..actual_end];
        if piece.is_ansi {
            for &byte in text_data {
                text.push(windows_1252_to_char(byte));
            }
        } else {
            // Drop a trailing odd byte, then decode as UTF-16LE.
            // decode_utf16 pairs surrogates correctly (a per-unit
            // char::from_u32 loop would silently drop every non-BMP
            // character); lone surrogates are skipped.
            let even_len = text_data.len() & !1;
            let units = text_data[..even_len]
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
            text.extend(char::decode_utf16(units).filter_map(|unit| unit.ok()));
        }
    }
    Ok(text)
}
/// Fallback extraction: read Windows-1252 bytes from the conventional text
/// start offset (0x200) until the first NUL or the end of the stream.
fn extract_text_simple(word_document: &[u8]) -> Result<String> {
    const TEXT_START: usize = 0x200;
    let tail = match word_document.get(TEXT_START..) {
        Some(bytes) => bytes,
        None => return Ok(String::new()),
    };
    let text: String = tail
        .iter()
        .take_while(|&&byte| byte != 0)
        .map(|&byte| windows_1252_to_char(byte))
        .collect();
    Ok(text)
}
}
#[derive(Debug, Clone)]
// One entry of the piece table: maps a character-position range to a byte
// range in the WordDocument stream.
struct PieceDescriptor {
// First character position (CP) covered by this piece.
cp_start: u32,
// One past the last CP covered by this piece.
cp_end: u32,
// Byte offset of the piece's text in the WordDocument stream.
file_pos: usize,
// true: 8-bit Windows-1252 text; false: UTF-16LE text.
is_ansi: bool,
}
/// Maps a Windows-1252 (CP1252) byte to its Unicode character.
///
/// Bytes 0x80..=0x9F differ from ISO-8859-1: CP1252 assigns printable
/// characters there. Every other byte maps identically to Latin-1, so the
/// `byte as char` fallback is correct for them. The five code points CP1252
/// leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) fall through to the
/// corresponding C1 controls.
fn windows_1252_to_char(byte: u8) -> char {
    match byte {
        0x80 => '€',
        0x82 => '‚',
        0x83 => 'ƒ',
        0x84 => '„',
        0x85 => '…',
        0x86 => '†',
        0x87 => '‡',
        0x88 => 'ˆ',
        0x89 => '‰',
        0x8A => 'Š',
        0x8B => '‹',
        0x8C => 'Œ',
        0x8E => 'Ž',
        0x91 => '\u{2018}', // left single quotation mark
        0x92 => '\u{2019}', // right single quotation mark
        // 0x93/0x94 are the curly double quotes in CP1252; mapping them to
        // ASCII '"' was inconsistent with the 0x91/0x92 handling above.
        0x93 => '\u{201C}', // left double quotation mark
        0x94 => '\u{201D}', // right double quotation mark
        0x95 => '•',
        0x96 => '–',
        0x97 => '—',
        0x98 => '˜',
        0x99 => '™',
        0x9A => 'š',
        0x9B => '›',
        0x9C => 'œ',
        0x9E => 'ž',
        0x9F => 'Ÿ',
        _ => byte as char,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_windows_1252_conversion() {
        assert_eq!(windows_1252_to_char(0x41), 'A');
        assert_eq!(windows_1252_to_char(0x80), '€');
        assert_eq!(windows_1252_to_char(0x93), '\u{201C}');
        assert_eq!(windows_1252_to_char(0x94), '\u{201D}');
    }
    #[test]
    fn test_clx_parsing_structure() {
        assert_eq!(PIECE_DESCRIPTOR_SIZE, 8);
        // One piece: CPs [0, 16], then a single all-zero 8-byte PCD.
        let minimal_plex = [
            0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        let result = TextExtractor::parse_plex_of_cps(&minimal_plex);
        assert!(result.is_ok());
        let pieces = result.unwrap();
        assert_eq!(pieces.len(), 1);
        assert_eq!(pieces[0].cp_start, 0);
        assert_eq!(pieces[0].cp_end, 16);
        assert_eq!(pieces[0].file_pos, 0);
        assert!(!pieces[0].is_ansi);
    }
}