litchi 0.0.1

High-performance parser for Microsoft Office, OpenDocument, and Apple iWork file formats with a unified API
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
/// Text extraction from DOC files.
///
/// This module handles extracting text from the binary structures in DOC files.
/// Text in DOC files is stored in a complex way:
/// - The actual text bytes are in the WordDocument stream
/// - A "Piece Table" (CLX structure) in the Table stream maps character positions to file positions
/// - Text can be in either 8-bit (Windows-1252) or 16-bit (UTF-16LE) format
use super::super::package::{DocError, Result};
use super::fib::FileInformationBlock;
use crate::ole::binary::{read_u16_le, read_u32_le};

/// Size of a single PieceDescriptor (PCD) structure in bytes.
///
/// 8 bytes as per Apache POI's `PieceDescriptor`: 2-byte descriptor,
/// 4-byte fc (file offset + flags), 2-byte prm.
pub const PIECE_DESCRIPTOR_SIZE: usize = 8;

/// Text extractor for DOC files.
///
/// Handles the complex text extraction process from DOC binary structures.
///
/// CLX parsing notes, based on Apache POI's PlexOfCps implementation: the CLX
/// structure is a Property List with Character Positions (PLCF) that contains:
/// - 4-byte count of entries
/// - For each entry: 4 bytes (CP start) + 4 bytes (CP end) + 8 bytes (PieceDescriptor)
/// - Total: (count + 1) * 4 + count * 8 bytes
///
/// NOTE(review): "CLX" is the "complex" part of the file in the [MS-DOC]
/// specification (the Clx structure), not "Compound Line Extension" — confirm
/// before relying on that expansion.
pub struct TextExtractor {
    /// The extracted text, already decoded to Unicode.
    text: String,
}

impl TextExtractor {
    /// Create a new `TextExtractor`, eagerly decoding the document text.
    ///
    /// # Arguments
    ///
    /// * `fib` - The File Information Block
    /// * `word_document` - The WordDocument stream
    /// * `table_stream` - The Table stream (0Table or 1Table)
    ///
    /// # Errors
    ///
    /// Propagates any error raised while locating or parsing the piece table.
    pub fn new(
        fib: &FileInformationBlock,
        word_document: &[u8],
        table_stream: &[u8],
    ) -> Result<Self> {
        // Decode everything up front via the piece table so later accessor
        // calls are cheap.
        Ok(Self {
            text: Self::extract_text_from_pieces(fib, word_document, table_stream)?,
        })
    }

    /// Return a copy of all text extracted from the document.
    pub fn extract_all_text(&self) -> Result<String> {
        // Text was already decoded in `new`; hand the caller an owned copy.
        let copy = self.text.clone();
        Ok(copy)
    }

    /// Extract text using the piece table (CLX structure).
    ///
    /// The piece table maps character positions in the document to file
    /// positions in the WordDocument stream, which lets Word handle
    /// insertions and deletions without rewriting the whole file.
    ///
    /// Falls back to [`Self::extract_text_simple`] when the FIB reports no
    /// CLX, or when CLX parsing fails or yields no text.
    fn extract_text_from_pieces(
        fib: &FileInformationBlock,
        word_document: &[u8],
        table_stream: &[u8],
    ) -> Result<String> {
        // Locate the CLX via the FIB. Index 33 is (fcClx, lcbClx) per Apache
        // POI's FIBFieldHandler.
        let (clx_offset, clx_length) = fib
            .get_table_pointer(33)
            .ok_or_else(|| DocError::Corrupted("CLX pointer not found in FIB".to_string()))?;

        // A zero-length CLX means a simplified/Word 6.0 layout where text
        // starts at a fixed offset instead of being described by pieces.
        if clx_length == 0 {
            return Self::extract_text_simple(word_document);
        }

        let start = clx_offset as usize;
        let len = clx_length as usize;
        let stream_len = table_stream.len();

        // Validate the CLX region against the table stream bounds.
        if start >= stream_len {
            return Err(DocError::Corrupted(format!(
                "CLX offset {} is beyond table stream length {}",
                start, stream_len
            )));
        }
        if start + len > stream_len {
            return Err(DocError::Corrupted(format!(
                "CLX extends beyond table stream: offset={}, length={}, stream_len={}",
                start, len, stream_len
            )));
        }

        let clx_data = &table_stream[start..start + len];

        // Best-effort: a failed or empty piece-table parse falls back to the
        // simple fixed-offset extraction rather than erroring out.
        if let Ok(text) = Self::parse_piece_table(clx_data, word_document) {
            if !text.is_empty() {
                return Ok(text);
            }
        }
        Self::extract_text_simple(word_document)
    }

    /// Parse the piece table from CLX data using Apache POI's ComplexFileTable logic.
    ///
    /// The CLX structure follows Apache POI's ComplexFileTable format:
    /// - Optional GRPPR L sections (type 0x01) for fast-saved files
    /// - TEXT_PIECE_TABLE_TYPE marker (0x02)
    /// - 4-byte size of the piece table data
    /// - The piece table data itself (PlexOfCps structure)
    ///
    /// Returns `Ok(String::new())` when no piece-table section is present so
    /// the caller can fall back to simple extraction.
    fn parse_piece_table(clx_data: &[u8], word_document: &[u8]) -> Result<String> {
        let mut offset = 0;

        // Walk the CLX section by section until the piece table (0x02) is found.
        // (A redundant end-of-data check inside the loop was removed: the loop
        // condition already guarantees `offset < clx_data.len()` here.)
        while offset < clx_data.len() {
            // Consume the 1-byte section type tag.
            let section_type = clx_data[offset];
            offset += 1;

            match section_type {
                0x01 => {
                    // GRPPR L section (property modifiers for fast-saved
                    // files): 2-byte size followed by the payload — skip it.
                    if offset + 2 > clx_data.len() {
                        return Err(DocError::Corrupted("GRPPR L section truncated".to_string()));
                    }

                    let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                    offset += 2 + size;
                }
                0x02 => {
                    // TEXT_PIECE_TABLE_TYPE: 4-byte payload size, then the
                    // PlexOfCps holding the piece descriptors.
                    if offset + 4 > clx_data.len() {
                        return Err(DocError::Corrupted("Piece table size field truncated".to_string()));
                    }

                    let piece_table_size = read_u32_le(clx_data, offset).unwrap_or(0) as usize;
                    offset += 4;

                    if offset + piece_table_size > clx_data.len() {
                        return Err(DocError::Corrupted("Piece table data truncated".to_string()));
                    }

                    let piece_table_data = &clx_data[offset..offset + piece_table_size];

                    // Parse the piece table using PlexOfCps logic.
                    let pieces = Self::parse_plex_of_cps(piece_table_data)?;

                    // Extract text from the parsed pieces.
                    return Self::extract_text_from_piece_descriptors(&pieces, word_document);
                }
                0x14 => {
                    // NOTE(review): 0x14 is not a documented CLX section type
                    // in [MS-DOC] (only 0x01 Prc and 0x02 Pcdt are); this skip
                    // (2-byte size + payload) is best-effort recovery for
                    // non-conforming files — confirm against sample documents.
                    if offset + 2 > clx_data.len() {
                        return Err(DocError::Corrupted("Document Properties section truncated".to_string()));
                    }

                    let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                    offset += 2 + size;
                }
                _ => {
                    // Unknown section type: assume a 2-byte size prefix and
                    // skip gracefully rather than failing the whole parse.
                    if offset + 2 <= clx_data.len() {
                        let size = read_u16_le(clx_data, offset).unwrap_or(0) as usize;
                        offset += 2 + size;
                    } else {
                        return Err(DocError::Corrupted(format!(
                            "Unexpected CLX section type 0x{:02X} at end of data", section_type
                        )));
                    }
                }
            }
        }

        // No piece table found in the CLX: return empty to trigger fallback.
        Ok(String::new())
    }

    /// Parse a PlexOfCps structure (Property List with Character Positions).
    ///
    /// Based on Apache POI's PlexOfCps implementation:
    /// Format: [CP0] [CP1] ... [CP_n] [Struct0] [Struct1] ... [Struct_{n-1}]
    /// - For n pieces, there are (n+1) CPs (4 bytes each)
    /// - Followed by n PieceDescriptor structs (8 bytes each)
    /// - Number of pieces: n = (size - 4) / (4 + 8)
    ///
    /// Returns an empty vector for inputs too small to hold a single piece;
    /// trailing bytes that do not form a complete entry are ignored
    /// (floor division in the piece-count formula).
    fn parse_plex_of_cps(plex_data: &[u8]) -> Result<Vec<PieceDescriptor>> {
        // Need at least one CP (4 bytes) for the structure to be meaningful.
        if plex_data.len() < 4 {
            return Ok(Vec::new());
        }

        // Calculate number of pieces using Apache POI's formula:
        // _iMac = (cb - 4) / (4 + cbStruct)
        let num_pieces = (plex_data.len() - 4) / (4 + PIECE_DESCRIPTOR_SIZE);

        if num_pieces == 0 {
            return Ok(Vec::new());
        }

        // Validate size. Defensive: by construction of `num_pieces` above this
        // check cannot fire, but it guards against future formula changes.
        let expected_size = 4 + num_pieces * (4 + PIECE_DESCRIPTOR_SIZE);
        if plex_data.len() < expected_size {
            return Err(DocError::Corrupted(format!(
                "PlexOfCps truncated: expected {} bytes, got {}",
                expected_size, plex_data.len()
            )));
        }

        let mut pieces = Vec::with_capacity(num_pieces);

        // Read all (n + 1) CPs first (they're at the beginning). CP i and
        // CP i+1 delimit the character range covered by piece i.
        let mut cps = Vec::with_capacity(num_pieces + 1);
        for i in 0..=num_pieces {
            let offset = i * 4;
            let cp = read_u32_le(plex_data, offset).unwrap_or(0);
            cps.push(cp);
        }

        // Now read all PieceDescriptors (they're after all the CPs)
        let struct_offset = (num_pieces + 1) * 4;
        for i in 0..num_pieces {
            let offset = struct_offset + i * PIECE_DESCRIPTOR_SIZE;

            if offset + PIECE_DESCRIPTOR_SIZE > plex_data.len() {
                return Err(DocError::Corrupted(format!(
                    "PieceDescriptor {} truncated", i
                )));
            }

            let piece_data = &plex_data[offset..offset + PIECE_DESCRIPTOR_SIZE];

            // Parse PieceDescriptor (matches Apache POI's PieceDescriptor
            // constructor): 2-byte descriptor, 4-byte fc, 2-byte prm.
            let _descriptor = read_u16_le(piece_data, 0).unwrap_or(0);
            let mut fc = read_u32_le(piece_data, 2).unwrap_or(0);
            let _prm = read_u16_le(piece_data, 6).unwrap_or(0);

            // Extract encoding information from fc (bit 30 indicates encoding)
            // From Apache POI PieceDescriptor.java lines 69-76:
            // If bit 30 is clear, this is Unicode (UTF-16LE)
            // If bit 30 is set, this is compressed ANSI (Windows-1252)
            let is_ansi = (fc & 0x40000000) != 0;
            if is_ansi {
                fc &= !0x40000000; // Clear the encoding bit
                // For compressed pieces the stored fc counts 2 bytes per
                // character; halving yields the byte offset (POI does the same).
                fc /= 2; // Adjust for ANSI (1 byte per character vs 2 bytes for Unicode)
            }

            pieces.push(PieceDescriptor {
                cp_start: cps[i],
                cp_end: cps[i + 1],
                file_pos: fc as usize,
                is_ansi,
            });
        }

        Ok(pieces)
    }



    /// Extract text from piece descriptors.
    ///
    /// Based on Apache POI's TextPieceTable logic, each piece descriptor maps
    /// a range of character positions (CP) to a location in the WordDocument
    /// stream, encoded either as Windows-1252 (ANSI) or UTF-16LE (Unicode).
    ///
    /// Pieces that point outside `word_document` are skipped (with a warning
    /// on stderr) rather than failing the whole extraction.
    fn extract_text_from_piece_descriptors(
        pieces: &[PieceDescriptor],
        word_document: &[u8],
    ) -> Result<String> {
        let mut text = String::new();

        for piece in pieces {
            // Character count from the CP range. saturating_sub so a corrupt
            // piece with cp_end < cp_start yields 0 instead of underflowing
            // (a plain subtraction panics in debug builds and wraps to a huge
            // count in release builds).
            let char_count = piece.cp_end.saturating_sub(piece.cp_start) as usize;

            if char_count == 0 {
                continue; // Empty or corrupt piece
            }

            // Byte size depends on encoding: 1 byte/char ANSI, 2 bytes/char UTF-16LE.
            let byte_count = if piece.is_ansi {
                char_count
            } else {
                char_count * 2
            };

            // Locate the text bytes inside the WordDocument stream.
            let start = piece.file_pos;
            let end = start + byte_count;

            if start >= word_document.len() {
                eprintln!("Warning: Piece file position {} beyond document length {}", start, word_document.len());
                continue;
            }

            if end > word_document.len() {
                eprintln!("Warning: Piece extends beyond document: start={}, end={}, doc_len={}", start, end, word_document.len());
                // Fall through and read whatever bytes are available.
            }

            let actual_end = end.min(word_document.len());
            let text_data = &word_document[start..actual_end];

            if piece.is_ansi {
                // 8-bit ANSI text (Windows-1252).
                for &byte in text_data {
                    text.push(windows_1252_to_char(byte));
                }
            } else {
                // 16-bit UTF-16LE text. Truncate to an even length so every
                // code unit is complete, then decode with surrogate-pair
                // support: mapping raw code units through char::from_u32 would
                // silently drop every character outside the Basic Multilingual
                // Plane (surrogates are not valid scalar values).
                let even_len = text_data.len() & !1;
                let code_units = text_data[..even_len]
                    .chunks_exact(2)
                    .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));

                for decoded in char::decode_utf16(code_units) {
                    // Skip unpaired surrogates, matching the previous behavior
                    // of silently dropping invalid code units.
                    if let Ok(ch) = decoded {
                        text.push(ch);
                    }
                }
            }
        }

        Ok(text)
    }

    /// Extract text in a simple way (for Word 6.0 or simplified docs).
    ///
    /// In older or simplified DOC files, text may start at a fixed offset
    /// without a piece table. Bytes are interpreted as Windows-1252 and
    /// reading stops at the first NUL byte.
    fn extract_text_simple(word_document: &[u8]) -> Result<String> {
        // Text typically starts at 0x200 (512); some files use 0x800 instead.
        const TEXT_START: usize = 0x200;

        // A document shorter than the start offset simply has no simple text.
        let text = word_document
            .get(TEXT_START..)
            .unwrap_or(&[])
            .iter()
            .take_while(|&&byte| byte != 0)
            .map(|&byte| windows_1252_to_char(byte))
            .collect();

        Ok(text)
    }
}

/// A piece descriptor.
///
/// Maps a range of character positions `[cp_start, cp_end)` to a byte
/// location in the WordDocument stream. Produced by
/// `TextExtractor::parse_plex_of_cps`.
#[derive(Debug, Clone)]
struct PieceDescriptor {
    /// Starting character position (inclusive)
    cp_start: u32,
    /// Ending character position (exclusive; piece length = cp_end - cp_start)
    cp_end: u32,
    /// Byte offset in the WordDocument stream (already halved for ANSI pieces
    /// during parsing, so it is a plain byte offset for both encodings)
    file_pos: usize,
    /// Whether text is ANSI/Windows-1252 (true) or UTF-16LE Unicode (false)
    is_ansi: bool,
}

/// Convert a Windows-1252 byte to a Unicode character.
///
/// Windows-1252 is mostly compatible with ISO-8859-1, but maps the
/// 0x80-0x9F range (C1 controls in ISO-8859-1) to printable characters.
/// Explicit `\u{...}` escapes are used so the mapping cannot be corrupted
/// by a source-file re-encoding (the previous literals had been garbled).
fn windows_1252_to_char(byte: u8) -> char {
    match byte {
        0x80 => '\u{20AC}', // Euro sign
        0x82 => '\u{201A}', // Single low-9 quotation mark
        0x83 => '\u{0192}', // Latin small letter f with hook
        0x84 => '\u{201E}', // Double low-9 quotation mark
        0x85 => '\u{2026}', // Horizontal ellipsis
        0x86 => '\u{2020}', // Dagger
        0x87 => '\u{2021}', // Double dagger
        0x88 => '\u{02C6}', // Modifier letter circumflex accent
        0x89 => '\u{2030}', // Per mille sign
        0x8A => '\u{0160}', // Latin capital letter S with caron
        0x8B => '\u{2039}', // Single left-pointing angle quotation mark
        0x8C => '\u{0152}', // Latin capital ligature OE
        0x8E => '\u{017D}', // Latin capital letter Z with caron
        0x91 => '\u{2018}', // Left single quotation mark
        0x92 => '\u{2019}', // Right single quotation mark
        0x93 => '\u{201C}', // Left double quotation mark
        0x94 => '\u{201D}', // Right double quotation mark
        0x95 => '\u{2022}', // Bullet
        0x96 => '\u{2013}', // En dash
        0x97 => '\u{2014}', // Em dash
        0x98 => '\u{02DC}', // Small tilde
        0x99 => '\u{2122}', // Trade mark sign
        0x9A => '\u{0161}', // Latin small letter s with caron
        0x9B => '\u{203A}', // Single right-pointing angle quotation mark
        0x9C => '\u{0153}', // Latin small ligature oe
        0x9E => '\u{017E}', // Latin small letter z with caron
        0x9F => '\u{0178}', // Latin capital letter Y with diaeresis
        // 0x00-0x7F match ASCII and 0xA0-0xFF match ISO-8859-1/Unicode
        // directly; undefined CP1252 bytes (0x81, 0x8D, 0x8F, 0x90, 0x9D)
        // fall through as their raw values, preserving prior behavior.
        _ => byte as char,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The CP1252 high range must map to the correct Unicode code points.
    /// Uses explicit escapes — the previous literals had been garbled by a
    /// source re-encoding.
    #[test]
    fn test_windows_1252_conversion() {
        assert_eq!(windows_1252_to_char(0x41), 'A');
        assert_eq!(windows_1252_to_char(0x80), '\u{20AC}'); // Euro sign
        assert_eq!(windows_1252_to_char(0x93), '\u{201C}'); // Left double quote
        assert_eq!(windows_1252_to_char(0x94), '\u{201D}'); // Right double quote
    }

    #[test]
    fn test_clx_parsing_structure() {
        // Test that the CLX structure constants are correct
        assert_eq!(PIECE_DESCRIPTOR_SIZE, 8);

        // Test basic parsing of a minimal PlexOfCps structure
        // Format: [CP0] [CP1] [Struct0]
        // For 1 piece: 2 CPs (8 bytes) + 1 PieceDescriptor (8 bytes) = 16 bytes
        let minimal_plex = [
            // CPs (2 of them for 1 piece)
            0x00, 0x00, 0x00, 0x00, // cp_start = 0
            0x10, 0x00, 0x00, 0x00, // cp_end = 16 (16 characters)
            // PieceDescriptor (8 bytes)
            0x00, 0x00,             // descriptor
            0x00, 0x00, 0x00, 0x00, // fc = 0 (Unicode at position 0)
            0x00, 0x00,             // prm
        ];

        // Should parse successfully
        let result = TextExtractor::parse_plex_of_cps(&minimal_plex);
        assert!(result.is_ok());
        let pieces = result.unwrap();
        assert_eq!(pieces.len(), 1);
        assert_eq!(pieces[0].cp_start, 0);
        assert_eq!(pieces[0].cp_end, 16);
        assert_eq!(pieces[0].file_pos, 0);
        assert!(!pieces[0].is_ansi); // Bit 30 not set = Unicode
    }
}