use super::error::{DocError, Result};
/// One piece-table entry: a run of character positions (CPs) plus the location of its text.
///
/// `parse_plc_pcd` builds these from a PlcPcd; `extract_text` consumes them.
#[derive(Debug, Clone)]
pub struct Piece {
/// First character position covered by this piece.
pub cp_start: u32,
/// Character position where this piece ends; `extract_text` treats it as exclusive
/// (the piece holds `cp_end - cp_start` characters).
pub cp_end: u32,
/// Raw 32-bit value read from the piece descriptor, flag bits included.
/// `extract_text` derives the byte offset of the text from it.
pub fc: u32,
/// Set by `parse_plc_pcd` when bit `0x40000000` of `fc` is set. `extract_text` then reads
/// one byte per character instead of two.
pub is_compressed: bool,
}
/// Parses a CLX byte buffer and returns the pieces described by its piece table.
///
/// The layout handled here is:
///
/// 1. zero or more blocks tagged `0x01` ("Grpprl" in the error messages), each made of the
///    marker byte, a little-endian `u16` payload length and the payload, all of which are
///    skipped;
/// 2. one block tagged `0x02` ("Pcdt"), made of the marker byte, a little-endian `u32`
///    length and the PlcPcd bytes, which are handed to `parse_plc_pcd`.
///
/// If the declared PlcPcd length runs past the end of `data`, the bytes that are actually
/// available are parsed instead of failing.
///
/// # Errors
///
/// Returns `DocError::InvalidPieceTable` when a `0x01` block header is truncated, when no
/// `0x02` marker is found after the skipped blocks, when the `u32` length field is
/// truncated, or when `parse_plc_pcd` rejects the table bytes.
pub fn parse_clx(data: &[u8]) -> Result<Vec<Piece>> {
    let mut pos = 0usize;

    // Skip every leading 0x01 block: marker byte, u16 payload length, payload.
    while pos < data.len() && data[pos] == 0x01 {
        if pos + 3 > data.len() {
            return Err(DocError::InvalidPieceTable("Grpprl truncated".into()));
        }
        let size = usize::from(u16::from_le_bytes([data[pos + 1], data[pos + 2]]));
        // This may step past the end of `data`; the marker check below then reports it.
        pos += 3 + size;
    }

    if pos >= data.len() || data[pos] != 0x02 {
        return Err(DocError::InvalidPieceTable(format!(
            "expected Pcdt (0x02) at offset {pos}, found {:?}",
            data.get(pos)
        )));
    }
    pos += 1;

    if pos + 4 > data.len() {
        return Err(DocError::InvalidPieceTable("Pcdt size truncated".into()));
    }
    let pcdt_size =
        u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
    pos += 4;

    // The declared size is untrusted: saturate so the addition cannot overflow on 32-bit
    // targets, then clamp to the buffer so a truncated table is still parsed best-effort.
    let end = pos.saturating_add(pcdt_size).min(data.len());
    parse_plc_pcd(&data[pos..end])
}
/// Decodes a PlcPcd: `n + 1` little-endian `u32` character positions followed by `n`
/// eight-byte piece descriptors.
///
/// `n` is derived from the buffer length as `(len - 4) / 12`. Piece `i` spans the CPs at
/// indices `i` and `i + 1`; its `fc` is the little-endian `u32` at bytes 2..6 of descriptor
/// `i`, and `is_compressed` mirrors bit `0x40000000` of that value.
///
/// # Errors
///
/// Returns `DocError::InvalidPieceTable` when `data` is shorter than 8 bytes or when the
/// computed layout would not fit inside `data`. A buffer of 8 to 15 bytes holds no complete
/// piece and yields an empty vector.
fn parse_plc_pcd(data: &[u8]) -> Result<Vec<Piece>> {
    const CP_LEN: usize = 4;
    const PCD_LEN: usize = 8;
    const COMPRESSED_BIT: u32 = 0x4000_0000;

    if data.len() < 8 {
        return Err(DocError::InvalidPieceTable("PlcPcd too small".into()));
    }

    let piece_count = (data.len() - CP_LEN) / (CP_LEN + PCD_LEN);
    if piece_count == 0 {
        return Ok(Vec::new());
    }

    // The descriptor array starts right after the `piece_count + 1` character positions.
    let pcd_base = (piece_count + 1) * CP_LEN;
    if pcd_base + piece_count * PCD_LEN > data.len() {
        return Err(DocError::InvalidPieceTable("PlcPcd size mismatch".into()));
    }

    let read_u32 =
        |at: usize| u32::from_le_bytes([data[at], data[at + 1], data[at + 2], data[at + 3]]);

    let pieces = (0..piece_count)
        .map(|idx| {
            // Skip the two leading bytes of the descriptor to reach the fc field.
            let fc = read_u32(pcd_base + idx * PCD_LEN + 2);
            Piece {
                cp_start: read_u32(idx * CP_LEN),
                cp_end: read_u32((idx + 1) * CP_LEN),
                fc,
                is_compressed: fc & COMPRESSED_BIT != 0,
            }
        })
        .collect();

    Ok(pieces)
}
/// Decodes the text referenced by `pieces` out of the `word_doc` byte stream.
///
/// Pieces are visited in order. Processing stops at the first piece whose `cp_start` is at
/// or beyond `max_chars`, and a piece that crosses `max_chars` is cut off at that limit.
///
/// * Compressed pieces store one byte per character at byte offset
///   `(fc & !0x40000000) / 2`; each byte is mapped through `cp1252_to_char`.
/// * Other pieces store two bytes per character at byte offset `fc`, decoded as
///   little-endian UTF-16 with invalid sequences replaced by U+FFFD.
///
/// This is best-effort: a piece whose byte range does not lie entirely inside `word_doc`
/// is skipped, and a malformed piece with `cp_end < cp_start` contributes no text.
pub fn extract_text(word_doc: &[u8], pieces: &[Piece], max_chars: u32) -> String {
    let mut text = String::new();

    for piece in pieces {
        if piece.cp_start >= max_chars {
            break;
        }

        // The table comes from untrusted file data, so `cp_end` may be below `cp_start`.
        // Saturate instead of underflowing (panic in debug, huge count in release).
        let char_count = piece.cp_end.min(max_chars).saturating_sub(piece.cp_start) as usize;

        if piece.is_compressed {
            let start = ((piece.fc & !0x4000_0000) / 2) as usize;
            // Checked arithmetic plus `get` rejects out-of-range pieces without overflow.
            let bytes = start
                .checked_add(char_count)
                .and_then(|end| word_doc.get(start..end));
            if let Some(bytes) = bytes {
                text.extend(bytes.iter().map(|&b| cp1252_to_char(b)));
            }
        } else {
            let start = piece.fc as usize;
            let bytes = char_count
                .checked_mul(2)
                .and_then(|len| start.checked_add(len))
                .and_then(|end| word_doc.get(start..end));
            if let Some(bytes) = bytes {
                let units: Vec<u16> = bytes
                    .chunks_exact(2)
                    .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                    .collect();
                text.push_str(&String::from_utf16_lossy(&units));
            }
        }
    }

    text
}
/// Maps one byte to a `char`, substituting the listed replacements for bytes `0x80..=0x9F`.
///
/// Bytes outside that range, and the five bytes inside it that have no replacement
/// (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`), map to the code point with the same numeric
/// value as the byte.
fn cp1252_to_char(b: u8) -> char {
    // Indexed by `b - 0x80`; entries without a replacement repeat the byte's own value.
    const HIGH_BLOCK: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}',
        '\u{2021}', '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\u{008D}',
        '\u{017D}', '\u{008F}', '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}',
        '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}',
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}',
    ];

    match b {
        0x80..=0x9F => HIGH_BLOCK[usize::from(b - 0x80)],
        _ => char::from(b),
    }
}
/// Returns a copy of `text` with control characters normalised.
///
/// * `'\r'`, `'\x0B'` and `'\x0C'` become `'\n'`.
/// * `'\x07'` becomes `'\t'`.
/// * `'\x01'`, `'\x08'`, `'\x13'`, `'\x14'` and `'\x15'` are removed.
/// * Every other character is copied unchanged.
///
/// NOTE(review): these look like Word's special marks (paragraph, cell, page break, field
/// delimiters, object anchors). Confirm against the format specification.
pub fn sanitize_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    cleaned.extend(text.chars().filter_map(|c| match c {
        '\r' | '\x0B' | '\x0C' => Some('\n'),
        '\x07' => Some('\t'),
        '\x01' | '\x08' | '\x13'..='\x15' => None,
        other => Some(other),
    }));
    cleaned
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a Pcdt block (0x02 marker, `u32` length, 16-byte PlcPcd) holding one piece
    /// that spans CPs `0..cp_end` and whose descriptor carries `fc`.
    fn single_piece_pcdt(cp_end: u32, fc: u32) -> Vec<u8> {
        let mut block = vec![0x02u8];
        block.extend_from_slice(&16u32.to_le_bytes());
        // CP array: start and end of the only piece.
        block.extend_from_slice(&0u32.to_le_bytes());
        block.extend_from_slice(&cp_end.to_le_bytes());
        // Descriptor: two bytes, fc, two bytes.
        block.extend_from_slice(&0u16.to_le_bytes());
        block.extend_from_slice(&fc.to_le_bytes());
        block.extend_from_slice(&0u16.to_le_bytes());
        block
    }

    /// Shorthand for a piece flagged as compressed.
    fn compressed_piece(cp_start: u32, cp_end: u32, fc: u32) -> Piece {
        Piece {
            cp_start,
            cp_end,
            fc,
            is_compressed: true,
        }
    }

    #[test]
    fn parse_clx_with_one_piece() {
        let clx = single_piece_pcdt(10, 0x4000_0100);

        let pieces = parse_clx(&clx).unwrap();

        assert_eq!(pieces.len(), 1);
        let only = &pieces[0];
        assert_eq!(only.cp_start, 0);
        assert_eq!(only.cp_end, 10);
        assert!(only.is_compressed);
    }

    #[test]
    fn parse_clx_with_grpprl_prefix() {
        // One 0x01 block with a three-byte payload precedes the Pcdt block.
        let mut clx = vec![0x01u8];
        clx.extend_from_slice(&3u16.to_le_bytes());
        clx.extend_from_slice(&[0u8; 3]);
        clx.extend(single_piece_pcdt(5, 0x4000_0000));

        let pieces = parse_clx(&clx).unwrap();

        assert_eq!(pieces.len(), 1);
        assert_eq!(pieces[0].cp_end, 5);
    }

    #[test]
    fn extract_compressed_text() {
        let mut word_doc = vec![0u8; 256];
        // fc 0x40000100 resolves to byte offset 0x80 once the flag is masked and halved.
        word_doc[0x80..0x85].copy_from_slice(b"Hello");
        let pieces = [compressed_piece(0, 5, 0x4000_0100)];

        assert_eq!(extract_text(&word_doc, &pieces, 5), "Hello");
    }

    #[test]
    fn extract_unicode_text() {
        let mut word_doc = vec![0u8; 256];
        word_doc[100..104].copy_from_slice(&[b'H', 0, b'i', 0]);
        let pieces = [Piece {
            cp_start: 0,
            cp_end: 2,
            fc: 100,
            is_compressed: false,
        }];

        assert_eq!(extract_text(&word_doc, &pieces, 2), "Hi");
    }

    #[test]
    fn extract_multiple_pieces() {
        let mut word_doc = vec![0u8; 512];
        word_doc[0x80..0x82].copy_from_slice(b"AB");
        word_doc[0x90..0x92].copy_from_slice(b"CD");
        let pieces = [
            compressed_piece(0, 2, 0x4000_0100),
            compressed_piece(2, 4, 0x4000_0120),
        ];

        assert_eq!(extract_text(&word_doc, &pieces, 4), "ABCD");
    }

    #[test]
    fn sanitize_paragraph_marks() {
        let cases = [
            ("Hello\rWorld", "Hello\nWorld"),
            ("A\x0CB", "A\nB"),
            ("A\x07B", "A\tB"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(sanitize_text(input), expected);
        }
    }

    #[test]
    fn sanitize_field_codes_stripped() {
        let cleaned = sanitize_text("before\x13FIELD\x14result\x15after");
        assert_eq!(cleaned, "beforeFIELDresultafter");
    }

    #[test]
    fn cp1252_special_chars() {
        let cases = [
            (0x80u8, '€'),
            (0x93, '\u{201C}'),
            (0x94, '\u{201D}'),
            (0x41, 'A'),
        ];
        for &(byte, expected) in &cases {
            assert_eq!(cp1252_to_char(byte), expected);
        }
    }

    #[test]
    fn max_chars_limits_output() {
        let mut word_doc = vec![0u8; 256];
        word_doc[0x80..0x85].copy_from_slice(b"Hello");
        let pieces = [compressed_piece(0, 5, 0x4000_0100)];

        // Only the first three of the five characters fit under the limit.
        assert_eq!(extract_text(&word_doc, &pieces, 3), "Hel");
    }
}