litchi 0.0.1

High-performance parser for Microsoft Office, OpenDocument, and Apple iWork file formats with unified API
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
//! Proper paragraph extraction from binary structures.
//!
//! Based on Apache POI's HWPF paragraph parsing logic, this module
//! extracts paragraphs using PLCF (Property List with Character Positions)
//! structures for paragraph boundaries (PAP) and character runs (CHP).
use super::super::package::Result;
use super::fib::FileInformationBlock;
use super::pap::ParagraphProperties;
use super::chp::CharacterProperties;
use super::chp_bin_table::ChpBinTable;
use super::piece_table::PieceTable;
use crate::ole::binary::PlcfParser;
use crate::ole::sprm::parse_sprms;

/// Data extracted for a single paragraph, as a `(text, properties, runs)` tuple.
///
/// * `text` - the paragraph's text
/// * `properties` - the paragraph-level formatting
/// * `runs` - the character runs as `(run_text, character_properties)` pairs
type ExtractedParagraph = (String, ParagraphProperties, Vec<(String, CharacterProperties)>);

/// Paragraph extractor using binary structures.
///
/// Based on Apache POI's ParagraphPropertiesTable (PAPBinTable) and
/// CharacterPropertiesTable (CHPBinTable).
///
/// Both tables are optional. Without the paragraph PLCF, paragraphs are
/// obtained by splitting the text into lines; without the character bin
/// table, each paragraph is reported as a single run with default properties.
pub struct ParagraphExtractor {
    /// Paragraph property data (PLCF); `None` when it is absent or unparsable.
    pap_plcf: Option<PlcfParser>,
    /// Character property bin table (properly parsed with FKP); `None` when
    /// it is absent or unparsable.
    chp_bin_table: Option<ChpBinTable>,
    /// The extracted text
    text: String,
    /// One entry per character of `text`: the character's CP, the CP after
    /// it, and the character's byte offset into `text`.
    text_ranges: Vec<(u32, u32, usize)>, // (cp_start, cp_end, text_offset)
    /// Character position range to extract (for subdocuments); `None` means
    /// no range filtering is applied.
    cp_range: Option<(u32, u32)>,
}

impl ParagraphExtractor {
    /// Create a new paragraph extractor.
    ///
    /// Looks up the paragraph (PAPX) bin table, the CLX piece table and the
    /// character (CHPX) bin table through the FIB's table pointers and parses
    /// whichever of them are present. A structure that is missing, lies
    /// outside `table_stream`, is too short, or fails to parse is simply
    /// recorded as absent; extraction then uses its fallback behaviour.
    ///
    /// # Arguments
    ///
    /// * `fib` - File Information Block
    /// * `table_stream` - Table stream (0Table or 1Table) data
    /// * `word_document` - WordDocument stream data
    /// * `text` - Extracted document text
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; the `Result` return
    /// type is kept so callers do not need to change.
    pub fn new(
        fib: &FileInformationBlock,
        table_stream: &[u8],
        word_document: &[u8],
        text: String,
    ) -> Result<Self> {
        // Index 13 in FibRgFcLcb97 is fcPlcfBtePapx/lcbPlcfBtePapx (PLCFBTEPAPX).
        // The PAP PLCF uses 4-byte property descriptors, so anything shorter
        // than 4 bytes cannot hold a single entry.
        let pap_plcf = fib
            .get_table_pointer(13)
            .and_then(|(offset, length)| {
                Self::table_slice(table_stream, offset as usize, length as usize)
            })
            .filter(|data| data.len() >= 4)
            .and_then(|data| PlcfParser::parse(data, 4));

        // Piece table from the CLX (Complex file information).
        // According to [MS-DOC], fcClx is at FIB offset 0x01A2. In FibRgFcLcb97
        // (starting at FIB offset 154) this is index 33: (0x01A2 - 154) / 8 = 33.
        let piece_table = fib
            .get_table_pointer(33)
            .and_then(|(offset, length)| {
                Self::table_slice(table_stream, offset as usize, length as usize)
            })
            .and_then(|data| PieceTable::parse(data));

        // Index 12 in FibRgFcLcb97 is fcPlcfBteChpx/lcbPlcfBteChpx (PLCFBTECHPX).
        // Parsing needs the piece table for FC-to-CP conversion, and the FKP
        // pages it refers to live in the WordDocument stream, not the table
        // stream.
        let chp_bin_table = fib
            .get_table_pointer(12)
            .zip(piece_table.as_ref())
            .and_then(|((offset, length), pt)| {
                Self::table_slice(table_stream, offset as usize, length as usize)
                    .filter(|data| data.len() >= 8)
                    .and_then(|data| ChpBinTable::parse(data, word_document, pt))
            });

        // Build text ranges for mapping CPs to text offsets
        let text_ranges = Self::build_text_ranges(&text);

        Ok(Self {
            pap_plcf,
            chp_bin_table,
            text,
            text_ranges,
            cp_range: None,
        })
    }

    /// Return the part of `table_stream` described by an `(offset, length)`
    /// table pointer, truncated to the end of the stream.
    ///
    /// Yields `None` for a zero `length` or an `offset` at or beyond the end
    /// of the stream. The clamping is done in `usize` so a large remaining
    /// length is never truncated.
    fn table_slice(table_stream: &[u8], offset: usize, length: usize) -> Option<&[u8]> {
        if length == 0 || offset >= table_stream.len() {
            return None;
        }
        let end = offset.saturating_add(length).min(table_stream.len());
        Some(&table_stream[offset..end])
    }

    /// Create a paragraph extractor restricted to a character position range.
    ///
    /// Builds the extractor exactly as [`Self::new`] does and then records
    /// `cp_range`, so that subdocuments (footnotes, headers, etc.) can be
    /// extracted on their own.
    ///
    /// # Arguments
    ///
    /// * `fib` - File Information Block
    /// * `table_stream` - Table stream (0Table or 1Table) data
    /// * `word_document` - WordDocument stream data
    /// * `text` - Extracted document text
    /// * `cp_range` - Character position range (start_cp, end_cp)
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Self::new`].
    pub fn new_with_range(
        fib: &FileInformationBlock,
        table_stream: &[u8],
        word_document: &[u8],
        text: String,
        cp_range: (u32, u32),
    ) -> Result<Self> {
        Self::new(fib, table_stream, word_document, text).map(|mut ranged| {
            ranged.cp_range = Some(cp_range);
            ranged
        })
    }

    /// Build the table that maps character positions to byte offsets in `text`.
    ///
    /// Yields one `(cp, cp + 1, byte_offset)` entry per `char`, in order, where
    /// `cp` is the zero-based index of the character and `byte_offset` is where
    /// its UTF-8 encoding starts within `text`.
    fn build_text_ranges(text: &str) -> Vec<(u32, u32, usize)> {
        text.char_indices()
            .enumerate()
            .map(|(index, (byte_offset, _))| {
                let cp = index as u32;
                (cp, cp + 1, byte_offset)
            })
            .collect()
    }

    /// Extract paragraphs with properties.
    ///
    /// Returns a vector of (text, paragraph_properties, character_runs) tuples.
    ///
    /// When a PAP PLCF is available, each of its entries yields one paragraph.
    /// If a CP range was set, entries entirely outside it are skipped and the
    /// remaining ones are clamped to it. Without a PAP PLCF the text is split
    /// into lines instead (see `extract_paragraphs_by_line`).
    ///
    /// # Errors
    ///
    /// Propagates any error from run extraction on the PLCF path.
    pub fn extract_paragraphs(&self) -> Result<Vec<ExtractedParagraph>> {
        let Some(pap_plcf) = self.pap_plcf.as_ref() else {
            return Ok(self.extract_paragraphs_by_line());
        };

        let mut paragraphs = Vec::new();

        // Iterate through paragraph boundaries
        for i in 0..pap_plcf.count() {
            let Some((mut para_start, mut para_end)) = pap_plcf.range(i) else {
                continue;
            };

            // Filter by CP range if specified (for subdocuments)
            if let Some((range_start, range_end)) = self.cp_range {
                // Skip paragraphs outside our range
                if para_end <= range_start || para_start >= range_end {
                    continue;
                }
                // Clamp paragraph boundaries to our range
                para_start = para_start.max(range_start);
                para_end = para_end.min(range_end);
            }

            let para_text = self.extract_text_range(para_start, para_end);

            // The PLCF property bytes are handed to `parse_papx`; defaults are
            // used when there is no property data or parsing fails.
            let para_props = pap_plcf
                .property(i)
                .map(|prop_data| Self::parse_papx(prop_data).unwrap_or_default())
                .unwrap_or_default();

            // Extract character runs within this paragraph
            let runs = self.extract_runs(para_start, para_end)?;

            paragraphs.push((para_text, para_props, runs));
        }

        Ok(paragraphs)
    }

    /// Fallback used when no PAP PLCF is available: one paragraph per line.
    ///
    /// Lines end at `\n` or `\r\n`, and the terminator is not part of the
    /// paragraph text. The CP cursor advances by the number of characters each
    /// line really occupies in the text, terminator included, so the run
    /// ranges stay aligned even when lines end in `\r\n` or the last line has
    /// no terminator. Every paragraph gets default paragraph properties.
    ///
    /// NOTE(review): this path does not apply `cp_range`, and text that uses a
    /// bare `\r` as its paragraph mark is not split - confirm whether either
    /// matters for callers.
    fn extract_paragraphs_by_line(&self) -> Vec<ExtractedParagraph> {
        let mut paragraphs = Vec::new();
        let mut char_pos = 0u32;

        for segment in self.text.split_inclusive('\n') {
            let line_end = char_pos + segment.chars().count() as u32;

            // Strip the terminator the same way `str::lines` does.
            let line = segment
                .strip_suffix('\n')
                .map_or(segment, |rest| rest.strip_suffix('\r').unwrap_or(rest));

            // Still try to extract character runs based on CHP.
            let runs = if line.is_empty() {
                vec![(String::new(), CharacterProperties::default())]
            } else {
                self.extract_runs(char_pos, line_end).unwrap_or_else(|_| {
                    vec![(line.to_string(), CharacterProperties::default())]
                })
            };

            paragraphs.push((line.to_string(), ParagraphProperties::default(), runs));

            char_pos = line_end;
        }

        paragraphs
    }

    /// Return the text covered by the character positions `cp_start..cp_end`.
    ///
    /// Both positions are first clamped to the number of characters in the
    /// text. An empty or inverted range yields an empty string.
    fn extract_text_range(&self, cp_start: u32, cp_end: u32) -> String {
        let cp_limit = self.text_ranges.len() as u32;
        let first = cp_start.min(cp_limit) as usize;
        let last = cp_end.min(cp_limit) as usize;

        if first >= last {
            return String::new();
        }

        let Some(&(_, _, byte_start)) = self.text_ranges.get(first) else {
            return String::new();
        };

        // A range that reaches the last character ends at the end of the text.
        let byte_end = self
            .text_ranges
            .get(last)
            .map_or(self.text.len(), |&(_, _, offset)| offset);

        if byte_start > byte_end {
            return String::new();
        }

        self.text[byte_start..byte_end].to_string()
    }

    /// Extract character runs (formatted text segments) within a paragraph.
    ///
    /// Each run from the character bin table that overlaps
    /// `para_start..para_end` is clipped to the paragraph and returned as
    /// `(run_text, properties)`. If there is no bin table, or no run overlaps
    /// the paragraph, the whole paragraph is returned as a single run with
    /// default character properties.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    fn extract_runs(
        &self,
        para_start: u32,
        para_end: u32,
    ) -> Result<Vec<(String, CharacterProperties)>> {
        let mut runs = Vec::new();

        if let Some(chp_bin_table) = &self.chp_bin_table {
            for run in chp_bin_table.runs_in_range(para_start, para_end) {
                // Clip the run to the paragraph boundaries.
                let actual_start = run.start_cp.max(para_start);
                let actual_end = run.end_cp.min(para_end);

                let run_text = self.extract_text_range(actual_start, actual_end);
                runs.push((run_text, run.properties.clone()));
            }
        }

        // If no runs found, return the whole paragraph as one run
        if runs.is_empty() {
            let para_text = self.extract_text_range(para_start, para_end);
            runs.push((para_text, CharacterProperties::default()));
        }

        Ok(runs)
    }

    /// Parse PAPX (Paragraph Property eXceptions) data.
    ///
    /// Based on Apache POI's PAPX.getParagraphProperties().
    ///
    /// The bytes are interpreted directly as a SPRM list and the recognised
    /// paragraph SPRMs are applied on top of default properties. Input shorter
    /// than 4 bytes yields the defaults. Unrecognised opcodes are ignored.
    ///
    /// NOTE(review): the caller passes the raw PLCF property bytes here; a
    /// full implementation would presumably follow them to the actual PAPX
    /// data first - confirm before relying on the parsed values.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    fn parse_papx(prop_data: &[u8]) -> Result<ParagraphProperties> {
        if prop_data.len() < 4 {
            return Ok(ParagraphProperties::default());
        }

        // Parse SPRMs (always 2-byte opcodes per Apache POI)
        let sprms = parse_sprms(prop_data);

        let mut props = ParagraphProperties::default();

        for sprm in &sprms {
            match sprm.opcode {
                0x2403 | 0x0003 => {
                    // sprmPJc80 - justification. 0 and any unknown value map
                    // to left alignment.
                    if let Some(val) = sprm.operand_byte() {
                        props.justification = match val {
                            1 => super::pap::Justification::Center,
                            2 => super::pap::Justification::Right,
                            3 => super::pap::Justification::Justified,
                            _ => super::pap::Justification::Left,
                        };
                    }
                }
                0x840F | 0x000F => {
                    // sprmPDxaLeft80 - left indent
                    props.indent_left = sprm.operand_i16().map(i32::from);
                }
                0x840E | 0x000E => {
                    // sprmPDxaRight80 - right indent.
                    // 0x8411 is sprmPDxaLeft180 (first-line indent), not the
                    // right indent, so it must not be stored here.
                    props.indent_right = sprm.operand_i16().map(i32::from);
                }
                0x2416 => {
                    // sprmPFInTable - paragraph is in a table
                    props.in_table = sprm.operand_byte().unwrap_or(0) != 0;
                }
                0x2417 => {
                    // sprmPFTtp - table row end marker (table trailer paragraph)
                    props.is_table_row_end = sprm.operand_byte().unwrap_or(0) != 0;
                }
                0x6649 => {
                    // sprmPItap - table nesting level (4-byte operand)
                    props.table_level = sprm.operand_dword().unwrap_or(0) as i32;
                }
                _ => {}
            }
        }

        Ok(props)
    }

}