Skip to main content

oxidize_pdf/text/plaintext/
extractor.rs

1//! Plain text extractor implementation with simplified API
2//!
3//! This module provides simplified text extraction that returns clean strings
4//! instead of position-annotated fragments.
5
6use super::types::{LineBreakMode, PlainTextConfig, PlainTextResult};
7use crate::parser::content::{ContentOperation, ContentParser};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::encoding::TextEncoding;
13use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
14use std::collections::HashMap;
15use std::io::{Read, Seek};
16
/// Identity transformation matrix in `[a, b, c, d, e, f]` order.
///
/// Both the text matrix and the text line matrix are reset to this value
/// whenever a text object begins.
const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
19
/// The slice of PDF text state that plain-text extraction keeps track of.
///
/// Matrices are six-element arrays in `[a, b, c, d, e, f]` order.
#[derive(Debug, Clone)]
struct TextState {
    /// Current text matrix; its translation part locates the next text run.
    text_matrix: [f64; 6],
    /// Matrix recorded for the start of the current line; relative moves are
    /// composed with it.
    text_line_matrix: [f64; 6],
    /// Vertical offset applied by the next-line operation.
    leading: f64,
    /// Size supplied by the most recent font selection.
    font_size: f64,
    /// Resource name of the most recently selected font, if any.
    font_name: Option<String>,
}

impl Default for TextState {
    /// Initial state: identity matrices, zero leading and font size, no font.
    fn default() -> Self {
        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        Self {
            text_matrix: identity,
            text_line_matrix: identity,
            leading: 0.0,
            font_size: 0.0,
            font_name: None,
        }
    }
}
41
/// Plain text extractor with simplified API
///
/// Extracts text from PDF pages without maintaining position information,
/// providing a simpler API by returning `String` and `Vec<String>` instead
/// of `Vec<TextFragment>`.
///
/// # Architecture
///
/// This extractor uses the same content stream parser as `TextExtractor`,
/// but discards position metadata to provide a simpler output format. It
/// tracks minimal position data (the x, y origin of the previous text run)
/// to decide where spaces and line breaks go, then returns clean text
/// strings.
///
/// # Performance Characteristics
///
/// - **Memory**: O(1) position tracking vs O(n) fragments
/// - **CPU**: No fragment sorting, no width calculations
/// - **Performance**: Comparable to `TextExtractor` (same parser)
///
/// # Reuse and Thread Safety
///
/// One extractor can be reused across multiple pages and documents: create
/// once, use many times. Fonts found on a page are cached under their
/// resource name and stay cached until the extractor is dropped.
///
/// The extraction methods take `&mut self`, so a single instance cannot be
/// used from several threads at once without external synchronization.
/// NOTE(review): whether the type is `Send`/`Sync` depends on
/// `CMapTextExtractor` and `FontInfo`, which are defined elsewhere — confirm
/// before sharing an extractor between threads.
///
/// # Examples
///
/// ## Basic Usage
///
/// ```no_run
/// use oxidize_pdf::parser::PdfReader;
/// use oxidize_pdf::text::plaintext::PlainTextExtractor;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = PdfReader::open_document("document.pdf")?;
///
/// let mut extractor = PlainTextExtractor::new();
/// let result = extractor.extract(&doc, 0)?;
///
/// println!("{}", result.text);
/// # Ok(())
/// # }
/// ```
///
/// ## Custom Configuration
///
/// ```no_run
/// use oxidize_pdf::parser::PdfReader;
/// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = PdfReader::open_document("document.pdf")?;
///
/// let config = PlainTextConfig {
///     space_threshold: 0.3,
///     newline_threshold: 12.0,
///     preserve_layout: true,
///     line_break_mode: oxidize_pdf::text::plaintext::LineBreakMode::Normalize,
/// };
///
/// let mut extractor = PlainTextExtractor::with_config(config);
/// let result = extractor.extract(&doc, 0)?;
/// # Ok(())
/// # }
/// ```
pub struct PlainTextExtractor {
    /// Configuration for extraction (spacing thresholds and line break mode)
    config: PlainTextConfig,
    /// Font cache for decoding text, keyed by the font's resource name.
    /// Entries are added or overwritten per page and never removed.
    font_cache: HashMap<String, FontInfo>,
    /// Cached CMap extractor for text decoding (reused across ShowText operations).
    /// NOTE(review): the `std::fs::File` parameter appears to exist only to
    /// pin the generic type; this file only calls `decode_text_with_font` on
    /// it, passing bytes and font info — confirm no reader is needed there.
    cmap_extractor: CMapTextExtractor<std::fs::File>,
}
114
115impl Default for PlainTextExtractor {
116    fn default() -> Self {
117        Self::new()
118    }
119}
120
121impl PlainTextExtractor {
122    /// Create a new extractor with default configuration
123    ///
124    /// # Examples
125    ///
126    /// ```
127    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
128    ///
129    /// let extractor = PlainTextExtractor::new();
130    /// ```
131    pub fn new() -> Self {
132        Self {
133            config: PlainTextConfig::default(),
134            font_cache: HashMap::new(),
135            cmap_extractor: CMapTextExtractor::new(),
136        }
137    }
138
139    /// Create a new extractor with custom configuration
140    ///
141    /// # Examples
142    ///
143    /// ```
144    /// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
145    ///
146    /// let config = PlainTextConfig::dense();
147    /// let extractor = PlainTextExtractor::with_config(config);
148    /// ```
149    pub fn with_config(config: PlainTextConfig) -> Self {
150        Self {
151            config,
152            font_cache: HashMap::new(),
153            cmap_extractor: CMapTextExtractor::new(),
154        }
155    }
156
157    /// Extract plain text from a PDF page
158    ///
159    /// Returns text with spaces and newlines inserted according to the
160    /// configured thresholds. Position information is not included in
161    /// the result.
162    ///
163    /// # Output
164    ///
165    /// Returns a `PlainTextResult` containing the extracted text as a `String`,
166    /// along with character count and line count metadata. This is simpler than
167    /// `TextExtractor` which returns `Vec<TextFragment>` with position data.
168    ///
169    /// # Examples
170    ///
171    /// ```no_run
172    /// use oxidize_pdf::parser::PdfReader;
173    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
174    ///
175    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
176    /// let doc = PdfReader::open_document("document.pdf")?;
177    ///
178    /// let mut extractor = PlainTextExtractor::new();
179    /// let result = extractor.extract(&doc, 0)?; // page index 0 = first page
180    ///
181    /// println!("Extracted {} characters", result.char_count);
182    /// # Ok(())
183    /// # }
184    /// ```
185    pub fn extract<R: Read + Seek>(
186        &mut self,
187        document: &PdfDocument<R>,
188        page_index: u32,
189    ) -> ParseResult<PlainTextResult> {
190        // Get the page
191        let page = document.get_page(page_index)?;
192
193        // Extract font resources
194        self.extract_font_resources(&page, document)?;
195
196        // Get content streams
197        let streams = page.content_streams_with_document(document)?;
198
199        // Pre-allocate String capacity to avoid reallocations
200        let mut extracted_text = String::with_capacity(4096);
201        let mut state = TextState::default();
202        let mut in_text_object = false;
203        let mut last_x = 0.0;
204        let mut last_y = 0.0;
205
206        // Process each content stream
207        for stream_data in streams {
208            let operations = match ContentParser::parse_content(&stream_data) {
209                Ok(ops) => ops,
210                Err(e) => {
211                    tracing::debug!("Warning: Failed to parse content stream, skipping: {}", e);
212                    continue;
213                }
214            };
215
216            for op in operations {
217                match op {
218                    ContentOperation::BeginText => {
219                        in_text_object = true;
220                        state.text_matrix = IDENTITY;
221                        state.text_line_matrix = IDENTITY;
222                    }
223
224                    ContentOperation::EndText => {
225                        in_text_object = false;
226                    }
227
228                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
229                        state.text_matrix =
230                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
231                        state.text_line_matrix =
232                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
233                    }
234
235                    ContentOperation::MoveText(tx, ty) => {
236                        let new_matrix = multiply_matrix(
237                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
238                            &state.text_line_matrix,
239                        );
240                        state.text_matrix = new_matrix;
241                        state.text_line_matrix = new_matrix;
242                    }
243
244                    ContentOperation::NextLine => {
245                        let new_matrix = multiply_matrix(
246                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
247                            &state.text_line_matrix,
248                        );
249                        state.text_matrix = new_matrix;
250                        state.text_line_matrix = new_matrix;
251                    }
252
253                    ContentOperation::ShowText(text) => {
254                        if in_text_object {
255                            let decoded = self.decode_text::<R>(&text, &state)?;
256
257                            // Calculate position (only x, y - no width/height needed)
258                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
259
260                            // Add spacing based on position change
261                            if !extracted_text.is_empty() {
262                                let dx = x - last_x;
263                                let dy = (y - last_y).abs();
264
265                                if dy > self.config.newline_threshold {
266                                    extracted_text.push('\n');
267                                } else if dx > self.config.space_threshold * state.font_size {
268                                    extracted_text.push(' ');
269                                }
270                            }
271
272                            extracted_text.push_str(&decoded);
273                            last_x = x;
274                            last_y = y;
275                        }
276                    }
277
278                    ContentOperation::SetFont(name, size) => {
279                        state.font_name = Some(name);
280                        state.font_size = size as f64;
281                    }
282
283                    ContentOperation::SetLeading(leading) => {
284                        state.leading = leading as f64;
285                    }
286
287                    _ => {
288                        // Ignore other operations (no graphics state needed for text extraction)
289                    }
290                }
291            }
292        }
293
294        // Apply line break mode processing
295        let processed_text = self.apply_line_break_mode(&extracted_text);
296
297        Ok(PlainTextResult::new(processed_text))
298    }
299
300    /// Extract text as individual lines
301    ///
302    /// Returns a vector of strings, one for each line detected in the page.
303    /// Useful for grep-like operations or line-based processing.
304    ///
305    /// # Examples
306    ///
307    /// ```no_run
308    /// use oxidize_pdf::parser::PdfReader;
309    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
310    ///
311    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
312    /// let doc = PdfReader::open_document("document.pdf")?;
313    ///
314    /// let mut extractor = PlainTextExtractor::new();
315    /// let lines = extractor.extract_lines(&doc, 0)?;
316    ///
317    /// for (i, line) in lines.iter().enumerate() {
318    ///     println!("{}: {}", i + 1, line);
319    /// }
320    /// # Ok(())
321    /// # }
322    /// ```
323    pub fn extract_lines<R: Read + Seek>(
324        &mut self,
325        document: &PdfDocument<R>,
326        page_index: u32,
327    ) -> ParseResult<Vec<String>> {
328        let result = self.extract(document, page_index)?;
329
330        Ok(result.text.lines().map(|line| line.to_string()).collect())
331    }
332
333    /// Extract font resources from the page
334    fn extract_font_resources<R: Read + Seek>(
335        &mut self,
336        page: &ParsedPage,
337        document: &PdfDocument<R>,
338    ) -> ParseResult<()> {
339        // Cache fonts persistently across pages (improves multi-page extraction)
340        // Font cache is only cleared when extractor is recreated
341
342        // Get page resources
343        if let Some(resources) = page.get_resources() {
344            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
345                // Extract each font
346                for (font_name, font_obj) in font_dict.0.iter() {
347                    if let Some(font_ref) = font_obj.as_reference() {
348                        if let Ok(PdfObject::Dictionary(font_dict)) =
349                            document.get_object(font_ref.0, font_ref.1)
350                        {
351                            // Create a CMap extractor to use its font extraction logic
352                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
353
354                            if let Ok(font_info) =
355                                cmap_extractor.extract_font_info(&font_dict, document)
356                            {
357                                self.font_cache.insert(font_name.0.clone(), font_info);
358                            }
359                        }
360                    }
361                }
362            }
363        }
364
365        Ok(())
366    }
367
368    /// Decode text using CMap if available
369    fn decode_text<R: Read + Seek>(
370        &self,
371        text_bytes: &[u8],
372        state: &TextState,
373    ) -> ParseResult<String> {
374        // Try CMap-based decoding first (using cached extractor)
375        if let Some(ref font_name) = state.font_name {
376            if let Some(font_info) = self.font_cache.get(font_name) {
377                if let Ok(decoded) = self
378                    .cmap_extractor
379                    .decode_text_with_font(text_bytes, font_info)
380                {
381                    return Ok(decoded);
382                }
383            }
384        }
385
386        // Fallback to encoding-based decoding (avoid allocation with case-insensitive check)
387        let encoding = if let Some(ref font_name) = state.font_name {
388            // Check for encoding type without allocating lowercase string
389            let font_lower = font_name.as_bytes();
390            if font_lower
391                .iter()
392                .any(|&b| b.to_ascii_lowercase() == b'r' && font_name.contains("roman"))
393            {
394                TextEncoding::MacRomanEncoding
395            } else if font_name.contains("WinAnsi") || font_name.contains("winansi") {
396                TextEncoding::WinAnsiEncoding
397            } else if font_name.contains("Standard") || font_name.contains("standard") {
398                TextEncoding::StandardEncoding
399            } else if font_name.contains("PdfDoc") || font_name.contains("pdfdoc") {
400                TextEncoding::PdfDocEncoding
401            } else if font_name.starts_with("Times")
402                || font_name.starts_with("Helvetica")
403                || font_name.starts_with("Courier")
404            {
405                TextEncoding::WinAnsiEncoding
406            } else {
407                TextEncoding::PdfDocEncoding
408            }
409        } else {
410            TextEncoding::WinAnsiEncoding
411        };
412
413        Ok(encoding.decode(text_bytes))
414    }
415
416    /// Apply line break mode processing
417    fn apply_line_break_mode(&self, text: &str) -> String {
418        match self.config.line_break_mode {
419            LineBreakMode::Auto => self.auto_line_breaks(text),
420            LineBreakMode::PreserveAll => text.to_string(),
421            LineBreakMode::Normalize => self.normalize_line_breaks(text),
422        }
423    }
424
425    /// Auto-detect line breaks (heuristic)
426    fn auto_line_breaks(&self, text: &str) -> String {
427        let lines: Vec<&str> = text.lines().collect();
428        let mut result = String::with_capacity(text.len());
429
430        for (i, line) in lines.iter().enumerate() {
431            let trimmed = line.trim_end();
432
433            if trimmed.is_empty() {
434                result.push('\n');
435                continue;
436            }
437
438            result.push_str(line);
439
440            if i < lines.len() - 1 {
441                let next_line = lines[i + 1].trim_start();
442
443                let ends_with_punct = trimmed.ends_with('.')
444                    || trimmed.ends_with('!')
445                    || trimmed.ends_with('?')
446                    || trimmed.ends_with(':');
447
448                let next_is_empty = next_line.is_empty();
449
450                if ends_with_punct || next_is_empty {
451                    result.push('\n');
452                } else {
453                    result.push(' ');
454                }
455            }
456        }
457
458        result
459    }
460
461    /// Normalize line breaks (join hyphenated words)
462    fn normalize_line_breaks(&self, text: &str) -> String {
463        let lines: Vec<&str> = text.lines().collect();
464        let mut result = String::with_capacity(text.len());
465
466        for (i, line) in lines.iter().enumerate() {
467            let trimmed = line.trim_end();
468
469            if trimmed.is_empty() {
470                result.push('\n');
471                continue;
472            }
473
474            if trimmed.ends_with('-') && i < lines.len() - 1 {
475                let next_line = lines[i + 1].trim_start();
476                if !next_line.is_empty() {
477                    result.push_str(&trimmed[..trimmed.len() - 1]);
478                    continue;
479                }
480            }
481
482            result.push_str(line);
483
484            if i < lines.len() - 1 {
485                result.push('\n');
486            }
487        }
488
489        result
490    }
491
492    /// Get the current configuration
493    ///
494    /// # Examples
495    ///
496    /// ```
497    /// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
498    ///
499    /// let config = PlainTextConfig::dense();
500    /// let extractor = PlainTextExtractor::with_config(config.clone());
501    /// assert_eq!(extractor.config().space_threshold, 0.1);
502    /// ```
503    pub fn config(&self) -> &PlainTextConfig {
504        &self.config
505    }
506}
507
/// Report whether every component of `matrix` equals the identity matrix
/// `[1, 0, 0, 1, 0, 0]` exactly (plain `==` comparison, no tolerance).
#[inline]
fn is_identity(matrix: &[f64; 6]) -> bool {
    const IDENTITY_COMPONENTS: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    *matrix == IDENTITY_COMPONENTS
}
518
519/// Multiply two 2D transformation matrices (optimized for identity)
520#[inline]
521fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
522    // Fast path: if m1 is identity, return m2
523    if is_identity(m1) {
524        return *m2;
525    }
526    // Fast path: if m2 is identity, return m1
527    if is_identity(m2) {
528        return *m1;
529    }
530
531    // Full matrix multiplication
532    [
533        m1[0] * m2[0] + m1[1] * m2[2],
534        m1[0] * m2[1] + m1[1] * m2[3],
535        m1[2] * m2[0] + m1[3] * m2[2],
536        m1[2] * m2[1] + m1[3] * m2[3],
537        m1[4] * m2[0] + m1[5] * m2[2] + m2[4],
538        m1[4] * m2[1] + m1[5] * m2[3] + m2[5],
539    ]
540}
541
/// Apply a transformation matrix `[a, b, c, d, e, f]` to the point `(x, y)`
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
#[inline]
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;

    (a * x + c * y + e, b * x + d * y + f)
}
549
// Unit tests for construction, the line break post-processing helpers and the
// matrix utilities. None of them needs a PDF document.
#[cfg(test)]
mod tests {
    use super::*;

    // `new` uses the default configuration.
    #[test]
    fn test_new() {
        let extractor = PlainTextExtractor::new();
        assert_eq!(extractor.config.space_threshold, 0.3);
    }

    // `with_config` stores the configuration it is given.
    #[test]
    fn test_with_config() {
        let config = PlainTextConfig::dense();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config, config);
    }

    // `Default` yields the default configuration.
    #[test]
    fn test_default() {
        let extractor = PlainTextExtractor::default();
        assert_eq!(extractor.config, PlainTextConfig::default());
    }

    // A trailing hyphen followed by more text joins the two lines.
    #[test]
    fn test_normalize_line_breaks_hyphenated() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a docu-\nment with hyphen-\nated words.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a document with hyphenated words.");
    }

    // Lines without a trailing hyphen keep their newlines.
    #[test]
    fn test_normalize_line_breaks_no_hyphen() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a normal\ntext without\nhyphens.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a normal\ntext without\nhyphens.");
    }

    // Sentence-ending punctuation keeps the newline that follows it.
    #[test]
    fn test_auto_line_breaks_punctuation() {
        let extractor = PlainTextExtractor::new();
        let text = "First sentence.\nSecond sentence.\nThird sentence.";
        let processed = extractor.auto_line_breaks(text);
        assert_eq!(
            processed,
            "First sentence.\nSecond sentence.\nThird sentence."
        );
    }

    // Lines without closing punctuation are joined with a space.
    #[test]
    fn test_auto_line_breaks_wrapped() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a long line that\nwas wrapped in the PDF\nfor layout purposes";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("long line that was"));
        assert!(processed.contains("wrapped in the PDF for"));
    }

    // Blank lines between paragraphs survive as a double newline.
    #[test]
    fn test_auto_line_breaks_empty_lines() {
        let extractor = PlainTextExtractor::new();
        let text = "Paragraph one.\n\nParagraph two.\n\nParagraph three.";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("\n\n"));
    }

    // `PreserveAll` returns the input untouched.
    #[test]
    fn test_apply_line_break_mode_preserve_all() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::PreserveAll,
            ..Default::default()
        });
        let text = "Line 1\nLine 2\nLine 3";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, text);
    }

    // `Normalize` routes through hyphen joining.
    #[test]
    fn test_apply_line_break_mode_normalize() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Normalize,
            ..Default::default()
        });
        let text = "docu-\nment";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, "document");
    }

    // `Auto` routes through the punctuation heuristic.
    #[test]
    fn test_apply_line_break_mode_auto() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Auto,
            ..Default::default()
        });
        let text = "First sentence.\nSecond part";
        let processed = extractor.apply_line_break_mode(text);
        assert!(processed.contains("First sentence.\nSecond"));
    }

    // The `config` getter exposes the stored configuration.
    #[test]
    fn test_config_getter() {
        let config = PlainTextConfig::loose();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config(), &config);
    }

    // Composing two translations adds their offsets.
    #[test]
    fn test_multiply_matrix() {
        let m1 = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let m2 = [1.0, 0.0, 0.0, 1.0, 5.0, 15.0];
        let result = multiply_matrix(&m1, &m2);
        assert_eq!(result, [1.0, 0.0, 0.0, 1.0, 15.0, 35.0]);
    }

    // A pure translation shifts the point by (e, f).
    #[test]
    fn test_transform_point() {
        let matrix = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 10.0, &matrix);
        assert_eq!(x, 15.0);
        assert_eq!(y, 30.0);
    }
}