Skip to main content

oxidize_pdf/text/plaintext/
extractor.rs

1//! Plain text extractor implementation with simplified API
2//!
3//! This module provides simplified text extraction that returns clean strings
4//! instead of position-annotated fragments.
5
6use super::types::{LineBreakMode, PlainTextConfig, PlainTextResult};
7use crate::parser::content::{ContentOperation, ContentParser};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::encoding::TextEncoding;
13use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
14use std::collections::HashMap;
15use std::io::{Read, Seek};
16
/// Identity transformation matrix in `[a, b, c, d, e, f]` order.
const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

/// Subset of the PDF text state tracked while walking a content stream.
#[derive(Debug, Clone)]
struct TextState {
    /// Current text matrix; its translation is used as the position of a text run.
    text_matrix: [f64; 6],
    /// Matrix that the line-movement operators are applied to.
    text_line_matrix: [f64; 6],
    /// Vertical offset subtracted when moving to the next line.
    leading: f64,
    /// Size given by the most recent font selection.
    font_size: f64,
    /// Name given by the most recent font selection; used as the font-cache key.
    font_name: Option<String>,
}

impl Default for TextState {
    /// Identity matrices, zero leading, zero font size and no font selected.
    fn default() -> Self {
        Self {
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            leading: 0.0,
            font_size: 0.0,
            font_name: None,
        }
    }
}
41
/// Plain text extractor with simplified API
///
/// Extracts text from PDF pages without maintaining position information,
/// providing a simpler API by returning `String` and `Vec<String>` instead
/// of `Vec<TextFragment>`.
///
/// # Architecture
///
/// This extractor uses the same content stream parser as `TextExtractor`,
/// but discards position metadata to provide a simpler output format. It
/// tracks minimal position data (the x, y origin of the previous text run)
/// to decide where to insert spaces and line breaks, then returns clean
/// text strings.
///
/// # Performance Characteristics
///
/// - **Memory**: O(1) position tracking vs O(n) fragments
/// - **CPU**: No fragment sorting, no width calculations
/// - **Performance**: Comparable to `TextExtractor` (same parser)
///
/// # Reuse and Thread Safety
///
/// A `PlainTextExtractor` can be created once and reused for many pages.
/// The extraction methods take `&mut self` because they update the font
/// cache, so one instance cannot be used from several threads at the same
/// time without external synchronization.
///
/// NOTE(review): cached fonts are keyed by font resource name and are never
/// evicted. When one extractor is reused across documents, confirm that an
/// entry cached for an earlier document cannot be picked up for a different
/// font that happens to share the same name.
///
/// # Examples
///
/// ## Basic Usage
///
/// ```no_run
/// use oxidize_pdf::parser::PdfReader;
/// use oxidize_pdf::text::plaintext::PlainTextExtractor;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = PdfReader::open_document("document.pdf")?;
///
/// let mut extractor = PlainTextExtractor::new();
/// let result = extractor.extract(&doc, 0)?;
///
/// println!("{}", result.text);
/// # Ok(())
/// # }
/// ```
///
/// ## Custom Configuration
///
/// ```no_run
/// use oxidize_pdf::parser::PdfReader;
/// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = PdfReader::open_document("document.pdf")?;
///
/// let config = PlainTextConfig {
///     space_threshold: 0.3,
///     newline_threshold: 12.0,
///     preserve_layout: true,
///     line_break_mode: oxidize_pdf::text::plaintext::LineBreakMode::Normalize,
/// };
///
/// let mut extractor = PlainTextExtractor::with_config(config);
/// let result = extractor.extract(&doc, 0)?;
/// # Ok(())
/// # }
/// ```
pub struct PlainTextExtractor {
    /// Thresholds and line-break mode applied during extraction
    config: PlainTextConfig,
    /// Font information keyed by the name under which the font appears in a
    /// page's `Font` resource dictionary; consulted when decoding text
    font_cache: HashMap<String, FontInfo>,
}
112
113impl Default for PlainTextExtractor {
114    fn default() -> Self {
115        Self::new()
116    }
117}
118
119impl PlainTextExtractor {
120    /// Create a new extractor with default configuration
121    ///
122    /// # Examples
123    ///
124    /// ```
125    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
126    ///
127    /// let extractor = PlainTextExtractor::new();
128    /// ```
129    pub fn new() -> Self {
130        Self {
131            config: PlainTextConfig::default(),
132            font_cache: HashMap::new(),
133        }
134    }
135
136    /// Create a new extractor with custom configuration
137    ///
138    /// # Examples
139    ///
140    /// ```
141    /// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
142    ///
143    /// let config = PlainTextConfig::dense();
144    /// let extractor = PlainTextExtractor::with_config(config);
145    /// ```
146    pub fn with_config(config: PlainTextConfig) -> Self {
147        Self {
148            config,
149            font_cache: HashMap::new(),
150        }
151    }
152
153    /// Extract plain text from a PDF page
154    ///
155    /// Returns text with spaces and newlines inserted according to the
156    /// configured thresholds. Position information is not included in
157    /// the result.
158    ///
159    /// # Output
160    ///
161    /// Returns a `PlainTextResult` containing the extracted text as a `String`,
162    /// along with character count and line count metadata. This is simpler than
163    /// `TextExtractor` which returns `Vec<TextFragment>` with position data.
164    ///
165    /// # Examples
166    ///
167    /// ```no_run
168    /// use oxidize_pdf::parser::PdfReader;
169    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
170    ///
171    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
172    /// let doc = PdfReader::open_document("document.pdf")?;
173    ///
174    /// let mut extractor = PlainTextExtractor::new();
175    /// let result = extractor.extract(&doc, 0)?; // page index 0 = first page
176    ///
177    /// println!("Extracted {} characters", result.char_count);
178    /// # Ok(())
179    /// # }
180    /// ```
181    pub fn extract<R: Read + Seek>(
182        &mut self,
183        document: &PdfDocument<R>,
184        page_index: u32,
185    ) -> ParseResult<PlainTextResult> {
186        // Get the page
187        let page = document.get_page(page_index)?;
188
189        // Extract font resources
190        self.extract_font_resources(&page, document)?;
191
192        // Get content streams
193        let streams = page.content_streams_with_document(document)?;
194
195        // Pre-allocate String capacity to avoid reallocations
196        let mut extracted_text = String::with_capacity(4096);
197        let mut state = TextState::default();
198        let mut in_text_object = false;
199        let mut last_x = 0.0;
200        let mut last_y = 0.0;
201
202        // Process each content stream
203        for stream_data in streams {
204            let operations = match ContentParser::parse_content(&stream_data) {
205                Ok(ops) => ops,
206                Err(e) => {
207                    tracing::debug!("Warning: Failed to parse content stream, skipping: {}", e);
208                    continue;
209                }
210            };
211
212            for op in operations {
213                match op {
214                    ContentOperation::BeginText => {
215                        in_text_object = true;
216                        state.text_matrix = IDENTITY;
217                        state.text_line_matrix = IDENTITY;
218                    }
219
220                    ContentOperation::EndText => {
221                        in_text_object = false;
222                    }
223
224                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
225                        state.text_matrix =
226                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
227                        state.text_line_matrix =
228                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
229                    }
230
231                    ContentOperation::MoveText(tx, ty) => {
232                        let new_matrix = multiply_matrix(
233                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
234                            &state.text_line_matrix,
235                        );
236                        state.text_matrix = new_matrix;
237                        state.text_line_matrix = new_matrix;
238                    }
239
240                    ContentOperation::NextLine => {
241                        let new_matrix = multiply_matrix(
242                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
243                            &state.text_line_matrix,
244                        );
245                        state.text_matrix = new_matrix;
246                        state.text_line_matrix = new_matrix;
247                    }
248
249                    ContentOperation::ShowText(text) => {
250                        if in_text_object {
251                            let decoded = self.decode_text::<R>(&text, &state)?;
252
253                            // Calculate position (only x, y - no width/height needed)
254                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
255
256                            // Add spacing based on position change
257                            if !extracted_text.is_empty() {
258                                let dx = x - last_x;
259                                let dy = (y - last_y).abs();
260
261                                if dy > self.config.newline_threshold {
262                                    extracted_text.push('\n');
263                                } else if dx > self.config.space_threshold * state.font_size {
264                                    extracted_text.push(' ');
265                                }
266                            }
267
268                            extracted_text.push_str(&decoded);
269                            last_x = x;
270                            last_y = y;
271                        }
272                    }
273
274                    ContentOperation::SetFont(name, size) => {
275                        state.font_name = Some(name);
276                        state.font_size = size as f64;
277                    }
278
279                    ContentOperation::SetLeading(leading) => {
280                        state.leading = leading as f64;
281                    }
282
283                    _ => {
284                        // Ignore other operations (no graphics state needed for text extraction)
285                    }
286                }
287            }
288        }
289
290        // Apply line break mode processing
291        let processed_text = self.apply_line_break_mode(&extracted_text);
292
293        Ok(PlainTextResult::new(processed_text))
294    }
295
296    /// Extract text as individual lines
297    ///
298    /// Returns a vector of strings, one for each line detected in the page.
299    /// Useful for grep-like operations or line-based processing.
300    ///
301    /// # Examples
302    ///
303    /// ```no_run
304    /// use oxidize_pdf::parser::PdfReader;
305    /// use oxidize_pdf::text::plaintext::PlainTextExtractor;
306    ///
307    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
308    /// let doc = PdfReader::open_document("document.pdf")?;
309    ///
310    /// let mut extractor = PlainTextExtractor::new();
311    /// let lines = extractor.extract_lines(&doc, 0)?;
312    ///
313    /// for (i, line) in lines.iter().enumerate() {
314    ///     println!("{}: {}", i + 1, line);
315    /// }
316    /// # Ok(())
317    /// # }
318    /// ```
319    pub fn extract_lines<R: Read + Seek>(
320        &mut self,
321        document: &PdfDocument<R>,
322        page_index: u32,
323    ) -> ParseResult<Vec<String>> {
324        let result = self.extract(document, page_index)?;
325
326        Ok(result.text.lines().map(|line| line.to_string()).collect())
327    }
328
329    /// Extract font resources from the page
330    fn extract_font_resources<R: Read + Seek>(
331        &mut self,
332        page: &ParsedPage,
333        document: &PdfDocument<R>,
334    ) -> ParseResult<()> {
335        // Cache fonts persistently across pages (improves multi-page extraction)
336        // Font cache is only cleared when extractor is recreated
337
338        // Get page resources
339        if let Some(resources) = page.get_resources() {
340            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
341                // Extract each font
342                for (font_name, font_obj) in font_dict.0.iter() {
343                    if let Some(font_ref) = font_obj.as_reference() {
344                        if let Ok(PdfObject::Dictionary(font_dict)) =
345                            document.get_object(font_ref.0, font_ref.1)
346                        {
347                            // Create a CMap extractor to use its font extraction logic
348                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
349
350                            if let Ok(font_info) =
351                                cmap_extractor.extract_font_info(&font_dict, document)
352                            {
353                                self.font_cache.insert(font_name.0.clone(), font_info);
354                            }
355                        }
356                    }
357                }
358            }
359        }
360
361        Ok(())
362    }
363
364    /// Decode text using CMap if available
365    fn decode_text<R: Read + Seek>(
366        &self,
367        text_bytes: &[u8],
368        state: &TextState,
369    ) -> ParseResult<String> {
370        // Try CMap-based decoding first (free function — no allocation)
371        if let Some(ref font_name) = state.font_name {
372            if let Some(font_info) = self.font_cache.get(font_name) {
373                if let Ok(decoded) =
374                    crate::text::extraction_cmap::decode_text_with_font(text_bytes, font_info)
375                {
376                    return Ok(decoded);
377                }
378            }
379        }
380
381        // Fallback to encoding-based decoding (avoid allocation with case-insensitive check)
382        let encoding = if let Some(ref font_name) = state.font_name {
383            // Check for encoding type without allocating lowercase string
384            let font_lower = font_name.as_bytes();
385            if font_lower
386                .iter()
387                .any(|&b| b.to_ascii_lowercase() == b'r' && font_name.contains("roman"))
388            {
389                TextEncoding::MacRomanEncoding
390            } else if font_name.contains("WinAnsi") || font_name.contains("winansi") {
391                TextEncoding::WinAnsiEncoding
392            } else if font_name.contains("Standard") || font_name.contains("standard") {
393                TextEncoding::StandardEncoding
394            } else if font_name.contains("PdfDoc") || font_name.contains("pdfdoc") {
395                TextEncoding::PdfDocEncoding
396            } else if font_name.starts_with("Times")
397                || font_name.starts_with("Helvetica")
398                || font_name.starts_with("Courier")
399            {
400                TextEncoding::WinAnsiEncoding
401            } else {
402                TextEncoding::PdfDocEncoding
403            }
404        } else {
405            TextEncoding::WinAnsiEncoding
406        };
407
408        Ok(encoding.decode(text_bytes))
409    }
410
411    /// Apply line break mode processing
412    fn apply_line_break_mode(&self, text: &str) -> String {
413        match self.config.line_break_mode {
414            LineBreakMode::Auto => self.auto_line_breaks(text),
415            LineBreakMode::PreserveAll => text.to_string(),
416            LineBreakMode::Normalize => self.normalize_line_breaks(text),
417        }
418    }
419
420    /// Auto-detect line breaks (heuristic)
421    fn auto_line_breaks(&self, text: &str) -> String {
422        let lines: Vec<&str> = text.lines().collect();
423        let mut result = String::with_capacity(text.len());
424
425        for (i, line) in lines.iter().enumerate() {
426            let trimmed = line.trim_end();
427
428            if trimmed.is_empty() {
429                result.push('\n');
430                continue;
431            }
432
433            result.push_str(line);
434
435            if i < lines.len() - 1 {
436                let next_line = lines[i + 1].trim_start();
437
438                let ends_with_punct = trimmed.ends_with('.')
439                    || trimmed.ends_with('!')
440                    || trimmed.ends_with('?')
441                    || trimmed.ends_with(':');
442
443                let next_is_empty = next_line.is_empty();
444
445                if ends_with_punct || next_is_empty {
446                    result.push('\n');
447                } else {
448                    result.push(' ');
449                }
450            }
451        }
452
453        result
454    }
455
456    /// Normalize line breaks (join hyphenated words)
457    fn normalize_line_breaks(&self, text: &str) -> String {
458        let lines: Vec<&str> = text.lines().collect();
459        let mut result = String::with_capacity(text.len());
460
461        for (i, line) in lines.iter().enumerate() {
462            let trimmed = line.trim_end();
463
464            if trimmed.is_empty() {
465                result.push('\n');
466                continue;
467            }
468
469            if trimmed.ends_with('-') && i < lines.len() - 1 {
470                let next_line = lines[i + 1].trim_start();
471                if !next_line.is_empty() {
472                    result.push_str(&trimmed[..trimmed.len() - 1]);
473                    continue;
474                }
475            }
476
477            result.push_str(line);
478
479            if i < lines.len() - 1 {
480                result.push('\n');
481            }
482        }
483
484        result
485    }
486
487    /// Get the current configuration
488    ///
489    /// # Examples
490    ///
491    /// ```
492    /// use oxidize_pdf::text::plaintext::{PlainTextExtractor, PlainTextConfig};
493    ///
494    /// let config = PlainTextConfig::dense();
495    /// let extractor = PlainTextExtractor::with_config(config.clone());
496    /// assert_eq!(extractor.config().space_threshold, 0.1);
497    /// ```
498    pub fn config(&self) -> &PlainTextConfig {
499        &self.config
500    }
501}
502
/// Return `true` when every component of `matrix` equals the corresponding
/// component of the identity matrix `[1, 0, 0, 1, 0, 0]` (exact comparison).
#[inline]
fn is_identity(matrix: &[f64; 6]) -> bool {
    const UNIT: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    *matrix == UNIT
}
513
514/// Multiply two 2D transformation matrices (optimized for identity)
515#[inline]
516fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
517    // Fast path: if m1 is identity, return m2
518    if is_identity(m1) {
519        return *m2;
520    }
521    // Fast path: if m2 is identity, return m1
522    if is_identity(m2) {
523        return *m1;
524    }
525
526    // Full matrix multiplication
527    [
528        m1[0] * m2[0] + m1[1] * m2[2],
529        m1[0] * m2[1] + m1[1] * m2[3],
530        m1[2] * m2[0] + m1[3] * m2[2],
531        m1[2] * m2[1] + m1[3] * m2[3],
532        m1[4] * m2[0] + m1[5] * m2[2] + m2[4],
533        m1[4] * m2[1] + m1[5] * m2[3] + m2[5],
534    ]
535}
536
/// Apply the transformation `matrix` (`[a, b, c, d, e, f]`) to the point
/// `(x, y)`, returning `(a·x + c·y + e, b·x + d·y + f)`.
#[inline]
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
544
/// Unit tests for construction, the line-break post-processing modes and the
/// matrix helpers. None of them needs a PDF document.
#[cfg(test)]
mod tests {
    use super::*;

    /// `new()` uses the default configuration (space threshold 0.3).
    #[test]
    fn test_new() {
        let extractor = PlainTextExtractor::new();
        assert_eq!(extractor.config.space_threshold, 0.3);
    }

    /// `with_config()` stores the configuration it is given.
    #[test]
    fn test_with_config() {
        let config = PlainTextConfig::dense();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config, config);
    }

    /// `Default` yields the default configuration.
    #[test]
    fn test_default() {
        let extractor = PlainTextExtractor::default();
        assert_eq!(extractor.config, PlainTextConfig::default());
    }

    /// Words split by a trailing hyphen are joined across the line break.
    #[test]
    fn test_normalize_line_breaks_hyphenated() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a docu-\nment with hyphen-\nated words.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a document with hyphenated words.");
    }

    /// Without trailing hyphens, normalization leaves the line breaks alone.
    #[test]
    fn test_normalize_line_breaks_no_hyphen() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a normal\ntext without\nhyphens.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a normal\ntext without\nhyphens.");
    }

    /// Auto mode keeps a break that follows sentence-ending punctuation.
    #[test]
    fn test_auto_line_breaks_punctuation() {
        let extractor = PlainTextExtractor::new();
        let text = "First sentence.\nSecond sentence.\nThird sentence.";
        let processed = extractor.auto_line_breaks(text);
        assert_eq!(
            processed,
            "First sentence.\nSecond sentence.\nThird sentence."
        );
    }

    /// Auto mode replaces a mid-sentence break with a space.
    #[test]
    fn test_auto_line_breaks_wrapped() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a long line that\nwas wrapped in the PDF\nfor layout purposes";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("long line that was"));
        assert!(processed.contains("wrapped in the PDF for"));
    }

    /// Auto mode preserves blank lines between paragraphs.
    #[test]
    fn test_auto_line_breaks_empty_lines() {
        let extractor = PlainTextExtractor::new();
        let text = "Paragraph one.\n\nParagraph two.\n\nParagraph three.";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("\n\n"));
    }

    /// `PreserveAll` returns the input unchanged.
    #[test]
    fn test_apply_line_break_mode_preserve_all() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::PreserveAll,
            ..Default::default()
        });
        let text = "Line 1\nLine 2\nLine 3";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, text);
    }

    /// `Normalize` dispatches to hyphen joining.
    #[test]
    fn test_apply_line_break_mode_normalize() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Normalize,
            ..Default::default()
        });
        let text = "docu-\nment";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, "document");
    }

    /// `Auto` dispatches to the punctuation heuristic.
    #[test]
    fn test_apply_line_break_mode_auto() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Auto,
            ..Default::default()
        });
        let text = "First sentence.\nSecond part";
        let processed = extractor.apply_line_break_mode(text);
        assert!(processed.contains("First sentence.\nSecond"));
    }

    /// `config()` returns a reference to the stored configuration.
    #[test]
    fn test_config_getter() {
        let config = PlainTextConfig::loose();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config(), &config);
    }

    /// Multiplying two pure translations adds their offsets.
    #[test]
    fn test_multiply_matrix() {
        let m1 = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let m2 = [1.0, 0.0, 0.0, 1.0, 5.0, 15.0];
        let result = multiply_matrix(&m1, &m2);
        assert_eq!(result, [1.0, 0.0, 0.0, 1.0, 15.0, 35.0]);
    }

    /// A pure translation shifts the point by its offsets.
    #[test]
    fn test_transform_point() {
        let matrix = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 10.0, &matrix);
        assert_eq!(x, 15.0);
        assert_eq!(y, 30.0);
    }
}