oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::ParseResult;
9use std::io::{Read, Seek};
10
/// Tunable settings that control how [`TextExtractor`] turns content-stream
/// text operations into plain text.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are collected during extraction and
    /// the final text is rebuilt from them.
    pub preserve_layout: bool,
    /// Horizontal gap that triggers a space character. The value is multiplied
    /// by the font size before being compared with the gap.
    pub space_threshold: f64,
    /// Vertical distance beyond which two pieces of text are considered to be
    /// on different lines.
    pub newline_threshold: f64,
    /// When `true`, collected fragments are ordered by position before the
    /// text is rebuilt. Fragments only exist when `preserve_layout` is set.
    pub sort_by_position: bool,
    /// When `true`, position sorting additionally tries to detect columns and
    /// orders the fragments column by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on one line for
    /// the gap to count as a column separator.
    pub column_threshold: f64,
    /// When `true`, a fragment ending in `-` at a line break is joined with
    /// the following line and the hyphen is dropped.
    pub merge_hyphenated: bool,
}
29
30impl Default for ExtractionOptions {
31    fn default() -> Self {
32        Self {
33            preserve_layout: false,
34            space_threshold: 0.2,
35            newline_threshold: 10.0,
36            sort_by_position: true,
37            detect_columns: false,
38            column_threshold: 50.0,
39            merge_hyphenated: true,
40        }
41    }
42}
43
44/// Extracted text with position information
45#[derive(Debug, Clone)]
46pub struct ExtractedText {
47    /// The extracted text content
48    pub text: String,
49    /// Text fragments with position information (if preserve_layout is true)
50    pub fragments: Vec<TextFragment>,
51}
52
/// One run of text together with where it was placed and how large it is.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// Horizontal coordinate of the run's origin, obtained by the extractor
    /// from the text matrix in effect when the run was shown.
    pub x: f64,
    /// Vertical coordinate of the run's origin, obtained the same way as `x`.
    pub y: f64,
    /// Approximate horizontal extent of the run.
    pub width: f64,
    /// Height of the run; the extractor stores the font size here.
    pub height: f64,
    /// Font size in effect when the run was shown.
    pub font_size: f64,
}
69
/// Mutable text-related graphics state tracked while walking the operations
/// of a content stream.
struct TextState {
    /// Text matrix: advanced after every shown string.
    text_matrix: [f64; 6],
    /// Text line matrix: the start of the current line, used as the base for
    /// line-relative moves.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM).
    // Stored for completeness; the extractor does not read it yet.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Leading: the vertical distance applied by the next-line operation.
    leading: f64,
    /// Character spacing as set by the content stream.
    char_space: f64,
    /// Word spacing as set by the content stream.
    word_space: f64,
    /// Horizontal scaling as a percentage (100 means unscaled).
    horizontal_scale: f64,
    /// Text rise as set by the content stream.
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Resource name of the currently selected font, if one has been set.
    font_name: Option<String>,
    /// Text render mode (0 = fill, 1 = stroke, etc.).
    render_mode: u8,
}
96
97impl Default for TextState {
98    fn default() -> Self {
99        Self {
100            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
101            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
102            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
103            leading: 0.0,
104            char_space: 0.0,
105            word_space: 0.0,
106            horizontal_scale: 100.0,
107            text_rise: 0.0,
108            font_size: 0.0,
109            font_name: None,
110            render_mode: 0,
111        }
112    }
113}
114
115/// Text extractor for PDF pages
116pub struct TextExtractor {
117    options: ExtractionOptions,
118}
119
120impl TextExtractor {
121    /// Create a new text extractor with default options
122    pub fn new() -> Self {
123        Self {
124            options: ExtractionOptions::default(),
125        }
126    }
127
128    /// Create a text extractor with custom options
129    pub fn with_options(options: ExtractionOptions) -> Self {
130        Self { options }
131    }
132
133    /// Extract text from a PDF document
134    pub fn extract_from_document<R: Read + Seek>(
135        &self,
136        document: &PdfDocument<R>,
137    ) -> ParseResult<Vec<ExtractedText>> {
138        let page_count = document.page_count()?;
139        let mut results = Vec::new();
140
141        for i in 0..page_count {
142            let text = self.extract_from_page(document, i)?;
143            results.push(text);
144        }
145
146        Ok(results)
147    }
148
149    /// Extract text from a specific page
150    pub fn extract_from_page<R: Read + Seek>(
151        &self,
152        document: &PdfDocument<R>,
153        page_index: u32,
154    ) -> ParseResult<ExtractedText> {
155        // Get the page
156        let page = document.get_page(page_index)?;
157
158        // Get content streams
159        let streams = page.content_streams_with_document(document)?;
160
161        let mut extracted_text = String::new();
162        let mut fragments = Vec::new();
163        let mut state = TextState::default();
164        let mut in_text_object = false;
165        let mut last_x = 0.0;
166        let mut last_y = 0.0;
167
168        // Process each content stream
169        for stream_data in streams {
170            let operations = ContentParser::parse_content(&stream_data)?;
171
172            for op in operations {
173                match op {
174                    ContentOperation::BeginText => {
175                        in_text_object = true;
176                        // Reset text matrix to identity
177                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
178                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
179                    }
180
181                    ContentOperation::EndText => {
182                        in_text_object = false;
183                    }
184
185                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
186                        state.text_matrix =
187                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
188                        state.text_line_matrix =
189                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
190                    }
191
192                    ContentOperation::MoveText(tx, ty) => {
193                        // Update text matrix by translation
194                        let new_matrix = multiply_matrix(
195                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
196                            &state.text_line_matrix,
197                        );
198                        state.text_matrix = new_matrix;
199                        state.text_line_matrix = new_matrix;
200                    }
201
202                    ContentOperation::NextLine => {
203                        // Move to next line using current leading
204                        let new_matrix = multiply_matrix(
205                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
206                            &state.text_line_matrix,
207                        );
208                        state.text_matrix = new_matrix;
209                        state.text_line_matrix = new_matrix;
210                    }
211
212                    ContentOperation::ShowText(text) => {
213                        if in_text_object {
214                            let text_bytes = &text;
215                            let decoded = self.decode_text(text_bytes, &state)?;
216
217                            // Calculate position
218                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
219
220                            // Add spacing based on position change
221                            if !extracted_text.is_empty() {
222                                let dx = x - last_x;
223                                let dy = (y - last_y).abs();
224
225                                if dy > self.options.newline_threshold {
226                                    extracted_text.push('\n');
227                                } else if dx > self.options.space_threshold * state.font_size {
228                                    extracted_text.push(' ');
229                                }
230                            }
231
232                            extracted_text.push_str(&decoded);
233
234                            if self.options.preserve_layout {
235                                fragments.push(TextFragment {
236                                    text: decoded.clone(),
237                                    x,
238                                    y,
239                                    width: calculate_text_width(&decoded, state.font_size),
240                                    height: state.font_size,
241                                    font_size: state.font_size,
242                                });
243                            }
244
245                            // Update position for next text
246                            last_x = x + calculate_text_width(&decoded, state.font_size);
247                            last_y = y;
248
249                            // Update text matrix for next show operation
250                            let text_width = calculate_text_width(&decoded, state.font_size);
251                            let tx = text_width * state.horizontal_scale / 100.0;
252                            state.text_matrix =
253                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
254                        }
255                    }
256
257                    ContentOperation::ShowTextArray(array) => {
258                        if in_text_object {
259                            for item in array {
260                                match item {
261                                    TextElement::Text(text_bytes) => {
262                                        let decoded = self.decode_text(&text_bytes, &state)?;
263                                        extracted_text.push_str(&decoded);
264
265                                        // Update text matrix
266                                        let text_width =
267                                            calculate_text_width(&decoded, state.font_size);
268                                        let tx = text_width * state.horizontal_scale / 100.0;
269                                        state.text_matrix = multiply_matrix(
270                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
271                                            &state.text_matrix,
272                                        );
273                                    }
274                                    TextElement::Spacing(adjustment) => {
275                                        // Text position adjustment (negative = move left)
276                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
277                                        state.text_matrix = multiply_matrix(
278                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
279                                            &state.text_matrix,
280                                        );
281                                    }
282                                }
283                            }
284                        }
285                    }
286
287                    ContentOperation::SetFont(name, size) => {
288                        state.font_name = Some(name);
289                        state.font_size = size as f64;
290                    }
291
292                    ContentOperation::SetLeading(leading) => {
293                        state.leading = leading as f64;
294                    }
295
296                    ContentOperation::SetCharSpacing(spacing) => {
297                        state.char_space = spacing as f64;
298                    }
299
300                    ContentOperation::SetWordSpacing(spacing) => {
301                        state.word_space = spacing as f64;
302                    }
303
304                    ContentOperation::SetHorizontalScaling(scale) => {
305                        state.horizontal_scale = scale as f64;
306                    }
307
308                    ContentOperation::SetTextRise(rise) => {
309                        state.text_rise = rise as f64;
310                    }
311
312                    ContentOperation::SetTextRenderMode(mode) => {
313                        state.render_mode = mode as u8;
314                    }
315
316                    _ => {
317                        // Other operations don't affect text extraction
318                    }
319                }
320            }
321        }
322
323        // Sort and process fragments if requested
324        if self.options.sort_by_position && !fragments.is_empty() {
325            self.sort_and_merge_fragments(&mut fragments);
326        }
327
328        // Reconstruct text from sorted fragments if layout is preserved
329        if self.options.preserve_layout && !fragments.is_empty() {
330            extracted_text = self.reconstruct_text_from_fragments(&fragments);
331        }
332
333        Ok(ExtractedText {
334            text: extracted_text,
335            fragments,
336        })
337    }
338
339    /// Sort text fragments by position and merge them appropriately
340    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
341        // Sort fragments by Y position (top to bottom) then X position (left to right)
342        fragments.sort_by(|a, b| {
343            // First compare Y position (with threshold for same line)
344            let y_diff = (b.y - a.y).abs();
345            if y_diff < self.options.newline_threshold {
346                // Same line, sort by X position
347                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
348            } else {
349                // Different lines, sort by Y (inverted because PDF Y increases upward)
350                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
351            }
352        });
353
354        // Detect columns if requested
355        if self.options.detect_columns {
356            self.detect_and_sort_columns(fragments);
357        }
358    }
359
360    /// Detect columns and re-sort fragments accordingly
361    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
362        // Group fragments by approximate Y position
363        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
364        let mut current_line: Vec<&mut TextFragment> = Vec::new();
365        let mut last_y = f64::INFINITY;
366
367        for fragment in fragments.iter_mut() {
368            let fragment_y = fragment.y;
369            if (last_y - fragment_y).abs() > self.options.newline_threshold
370                && !current_line.is_empty()
371            {
372                lines.push(current_line);
373                current_line = Vec::new();
374            }
375            current_line.push(fragment);
376            last_y = fragment_y;
377        }
378        if !current_line.is_empty() {
379            lines.push(current_line);
380        }
381
382        // Detect column boundaries
383        let mut column_boundaries = vec![0.0];
384        for line in &lines {
385            if line.len() > 1 {
386                for i in 0..line.len() - 1 {
387                    let gap = line[i + 1].x - (line[i].x + line[i].width);
388                    if gap > self.options.column_threshold {
389                        let boundary = line[i].x + line[i].width + gap / 2.0;
390                        if !column_boundaries
391                            .iter()
392                            .any(|&b| (b - boundary).abs() < 10.0)
393                        {
394                            column_boundaries.push(boundary);
395                        }
396                    }
397                }
398            }
399        }
400        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
401
402        // Re-sort fragments by column then Y position
403        if column_boundaries.len() > 1 {
404            fragments.sort_by(|a, b| {
405                // Determine column for each fragment
406                let col_a = column_boundaries
407                    .iter()
408                    .position(|&boundary| a.x < boundary)
409                    .unwrap_or(column_boundaries.len())
410                    - 1;
411                let col_b = column_boundaries
412                    .iter()
413                    .position(|&boundary| b.x < boundary)
414                    .unwrap_or(column_boundaries.len())
415                    - 1;
416
417                if col_a != col_b {
418                    col_a.cmp(&col_b)
419                } else {
420                    // Same column, sort by Y position
421                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
422                }
423            });
424        }
425    }
426
427    /// Reconstruct text from sorted fragments
428    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
429        let mut result = String::new();
430        let mut last_y = f64::INFINITY;
431        let mut last_x = 0.0;
432        let mut last_line_ended_with_hyphen = false;
433
434        for fragment in fragments {
435            // Check if we need a newline
436            let y_diff = (last_y - fragment.y).abs();
437            if !result.is_empty() && y_diff > self.options.newline_threshold {
438                // Handle hyphenation
439                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
440                    // Remove the hyphen and don't add newline
441                    if result.ends_with('-') {
442                        result.pop();
443                    }
444                } else {
445                    result.push('\n');
446                }
447            } else if !result.is_empty() {
448                // Check if we need a space
449                let x_gap = fragment.x - last_x;
450                if x_gap > self.options.space_threshold * fragment.font_size {
451                    result.push(' ');
452                }
453            }
454
455            result.push_str(&fragment.text);
456            last_line_ended_with_hyphen = fragment.text.ends_with('-');
457            last_y = fragment.y;
458            last_x = fragment.x + fragment.width;
459        }
460
461        result
462    }
463
464    /// Decode text using the current font encoding
465    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
466        use crate::text::encoding::TextEncoding;
467
468        // Try to determine encoding from font name
469        let encoding = if let Some(ref font_name) = state.font_name {
470            match font_name.to_lowercase().as_str() {
471                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
472                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
473                name if name.contains("standard") => TextEncoding::StandardEncoding,
474                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
475                _ => {
476                    // Default based on common patterns
477                    if font_name.starts_with("Times")
478                        || font_name.starts_with("Helvetica")
479                        || font_name.starts_with("Courier")
480                    {
481                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
482                    } else {
483                        TextEncoding::PdfDocEncoding // Safe default
484                    }
485                }
486            }
487        } else {
488            TextEncoding::WinAnsiEncoding // Default for most PDFs
489        };
490
491        Ok(encoding.decode(text))
492    }
493}
494
495impl Default for TextExtractor {
496    fn default() -> Self {
497        Self::new()
498    }
499}
500
/// Computes the product `a × b` of two affine transformations.
///
/// Each matrix is given as its six variable entries `[a, b, c, d, e, f]`,
/// standing for the 3×3 matrix with rows `(a, b, 0)`, `(c, d, 0)` and
/// `(e, f, 1)`. The result is in the same six-element form.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear 2×2 part of the product.
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;

    // Translation row: a's translation mapped through b, plus b's own.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
512
/// Maps the point `(x, y)` through the affine transformation `matrix`,
/// given in six-element `[a, b, c, d, e, f]` form, and returns the new
/// coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
519
/// Estimates the width of `text` when set at `font_size`.
///
/// No font metrics are consulted: every character is assumed to be half the
/// font size wide. The estimate counts characters, not UTF-8 bytes, so
/// decoded non-ASCII text (accented letters, typographic quotes, ...) is not
/// counted as wider than it is.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    // `str::len` would return the byte length, which over-counts every
    // character outside the ASCII range.
    text.chars().count() as f64 * font_size * 0.5
}
525
#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transformation shared by the matrix tests.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Builds a 50×12 fragment with font size 10 on the line `y = 200`.
    fn fragment_at(text: &str, x: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        }
    }

    /// Asserts that `options` carries exactly the documented default values.
    fn assert_default_options(options: &ExtractionOptions) {
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    // Multiplying by the identity on either side leaves a matrix unchanged.
    #[test]
    fn test_matrix_multiplication() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        assert_eq!(multiply_matrix(&IDENTITY, &translation), translation);
        assert_eq!(multiply_matrix(&translation, &IDENTITY), translation);
    }

    // A pure translation shifts the point by its offsets.
    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let moved = transform_point(5.0, 5.0, &translation);
        assert_eq!(moved.0, 15.0);
        assert_eq!(moved.1, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        assert_default_options(&ExtractionOptions::default());
    }

    // Every field can be overridden independently of the defaults.
    #[test]
    fn test_extraction_options_custom() {
        let custom = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };
        assert!(custom.preserve_layout);
        assert_eq!(custom.space_threshold, 0.5);
        assert_eq!(custom.newline_threshold, 15.0);
        assert!(!custom.sort_by_position);
        assert!(custom.detect_columns);
        assert_eq!(custom.column_threshold, 75.0);
        assert!(!custom.merge_hyphenated);
    }

    #[test]
    fn test_text_fragment() {
        let hello = fragment_at("Hello", 100.0);
        assert_eq!(hello.text, "Hello");
        assert_eq!(hello.x, 100.0);
        assert_eq!(hello.y, 200.0);
        assert_eq!(hello.width, 50.0);
        assert_eq!(hello.height, 12.0);
        assert_eq!(hello.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let fragments = vec![fragment_at("Hello", 100.0), fragment_at("World", 160.0)];

        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments: fragments.clone(),
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        let texts: Vec<&str> = extracted
            .fragments
            .iter()
            .map(|fragment| fragment.text.as_str())
            .collect();
        assert_eq!(texts[0], "Hello");
        assert_eq!(texts[1], "World");
    }

    #[test]
    fn test_text_state_default() {
        let state = TextState::default();

        // All matrices start as the identity.
        assert_eq!(state.text_matrix, IDENTITY);
        assert_eq!(state.text_line_matrix, IDENTITY);
        assert_eq!(state.ctm, IDENTITY);

        // Scalar parameters.
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // 90 degree rotation maps the x unit vector onto the y unit vector.
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        let rotated = transform_point(1.0, 0.0, &rotation);
        assert_eq!(rotated.0, 0.0);
        assert_eq!(rotated.1, 1.0);

        // Non-uniform scaling.
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        let scaled = transform_point(5.0, 5.0, &scale);
        assert_eq!(scaled.0, 10.0);
        assert_eq!(scaled.1, 15.0);

        // Combined linear part and translation.
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        let mapped = transform_point(1.0, 1.0, &complex);
        assert_eq!(mapped.0, 13.0); // 2*1 + 1*1 + 10
        assert_eq!(mapped.1, 23.0); // 1*1 + 2*1 + 20
    }

    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        assert_default_options(&extractor.options);
    }

    // `with_options` must store the supplied options unchanged.
    #[test]
    fn test_text_extractor_with_options() {
        let expected = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };
        let extractor = TextExtractor::with_options(expected.clone());
        let actual = &extractor.options;

        assert_eq!(actual.preserve_layout, expected.preserve_layout);
        assert_eq!(actual.space_threshold, expected.space_threshold);
        assert_eq!(actual.newline_threshold, expected.newline_threshold);
        assert_eq!(actual.sort_by_position, expected.sort_by_position);
        assert_eq!(actual.detect_columns, expected.detect_columns);
        assert_eq!(actual.column_threshold, expected.column_threshold);
        assert_eq!(actual.merge_hyphenated, expected.merge_hyphenated);
    }
}