// oxidize_pdf/text/extraction.rs

//! Text extraction from PDF content streams
//!
//! This module provides functionality to extract text from PDF pages,
//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Tunable parameters that control how text is extracted from a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are recorded for each shown string and
    /// the final text is rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the font size, above which a space
    /// character is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text are treated as being
    /// on different lines (a newline is inserted).
    pub newline_threshold: f64,
    /// Order the recorded fragments by position before rebuilding the text
    /// (useful for multi-column layouts).
    pub sort_by_position: bool,
    /// Look for column gaps while sorting and order fragments column by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on a line for the
    /// gap to count as a column separator.
    pub column_threshold: f64,
    /// Join a word split by a trailing hyphen at a line end with the start of
    /// the following line.
    pub merge_hyphenated: bool,
}
33
34impl Default for ExtractionOptions {
35    fn default() -> Self {
36        Self {
37            preserve_layout: false,
38            space_threshold: 0.2,
39            newline_threshold: 10.0,
40            sort_by_position: true,
41            detect_columns: false,
42            column_threshold: 50.0,
43            merge_hyphenated: true,
44        }
45    }
46}
47
48/// Extracted text with position information
49#[derive(Debug, Clone)]
50pub struct ExtractedText {
51    /// The extracted text content
52    pub text: String,
53    /// Text fragments with position information (if preserve_layout is true)
54    pub fragments: Vec<TextFragment>,
55}
56
/// One run of shown text together with where it was placed.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// Horizontal position of the run's origin.
    pub x: f64,
    /// Vertical position of the run's origin.
    pub y: f64,
    /// Estimated horizontal extent of the run.
    pub width: f64,
    /// Vertical extent of the run.
    pub height: f64,
    /// Font size in effect when the run was shown.
    pub font_size: f64,
}
73
/// Mutable text-related graphics state tracked while walking a content stream.
struct TextState {
    /// Text matrix: where the next shown string starts.
    text_matrix: [f64; 6],
    /// Text line matrix: start of the current line, used as the base for
    /// line-relative moves.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM).
    // Initialised but not consulted by the extractor yet, hence the allow.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Leading (line spacing) applied by the next-line operator.
    leading: f64,
    /// Character spacing.
    char_space: f64,
    /// Word spacing.
    word_space: f64,
    /// Horizontal scaling as a percentage (100 means unscaled).
    horizontal_scale: f64,
    /// Text rise.
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Resource name of the currently selected font, if one has been set.
    font_name: Option<String>,
    /// Text render mode (0 = fill, 1 = stroke, etc.).
    render_mode: u8,
}
100
101impl Default for TextState {
102    fn default() -> Self {
103        Self {
104            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
105            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
106            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
107            leading: 0.0,
108            char_space: 0.0,
109            word_space: 0.0,
110            horizontal_scale: 100.0,
111            text_rise: 0.0,
112            font_size: 0.0,
113            font_name: None,
114            render_mode: 0,
115        }
116    }
117}
118
119/// Text extractor for PDF pages with CMap support
120pub struct TextExtractor {
121    options: ExtractionOptions,
122    /// Font cache for the current extraction
123    font_cache: HashMap<String, FontInfo>,
124}
125
126impl TextExtractor {
127    /// Create a new text extractor with default options
128    pub fn new() -> Self {
129        Self {
130            options: ExtractionOptions::default(),
131            font_cache: HashMap::new(),
132        }
133    }
134
135    /// Create a text extractor with custom options
136    pub fn with_options(options: ExtractionOptions) -> Self {
137        Self {
138            options,
139            font_cache: HashMap::new(),
140        }
141    }
142
143    /// Extract text from a PDF document
144    pub fn extract_from_document<R: Read + Seek>(
145        &mut self,
146        document: &PdfDocument<R>,
147    ) -> ParseResult<Vec<ExtractedText>> {
148        let page_count = document.page_count()?;
149        let mut results = Vec::new();
150
151        for i in 0..page_count {
152            let text = self.extract_from_page(document, i)?;
153            results.push(text);
154        }
155
156        Ok(results)
157    }
158
159    /// Extract text from a specific page
160    pub fn extract_from_page<R: Read + Seek>(
161        &mut self,
162        document: &PdfDocument<R>,
163        page_index: u32,
164    ) -> ParseResult<ExtractedText> {
165        // Get the page
166        let page = document.get_page(page_index)?;
167
168        // Extract font resources first
169        self.extract_font_resources(&page, document)?;
170
171        // Get content streams
172        let streams = page.content_streams_with_document(document)?;
173
174        let mut extracted_text = String::new();
175        let mut fragments = Vec::new();
176        let mut state = TextState::default();
177        let mut in_text_object = false;
178        let mut last_x = 0.0;
179        let mut last_y = 0.0;
180
181        // Process each content stream
182        for stream_data in streams {
183            let operations = match ContentParser::parse_content(&stream_data) {
184                Ok(ops) => ops,
185                Err(e) => {
186                    // Log the error but continue processing other streams
187                    eprintln!("Warning: Failed to parse content stream, skipping: {}", e);
188                    continue;
189                }
190            };
191
192            for op in operations {
193                match op {
194                    ContentOperation::BeginText => {
195                        in_text_object = true;
196                        // Reset text matrix to identity
197                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
198                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
199                    }
200
201                    ContentOperation::EndText => {
202                        in_text_object = false;
203                    }
204
205                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
206                        state.text_matrix =
207                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
208                        state.text_line_matrix =
209                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
210                    }
211
212                    ContentOperation::MoveText(tx, ty) => {
213                        // Update text matrix by translation
214                        let new_matrix = multiply_matrix(
215                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
216                            &state.text_line_matrix,
217                        );
218                        state.text_matrix = new_matrix;
219                        state.text_line_matrix = new_matrix;
220                    }
221
222                    ContentOperation::NextLine => {
223                        // Move to next line using current leading
224                        let new_matrix = multiply_matrix(
225                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
226                            &state.text_line_matrix,
227                        );
228                        state.text_matrix = new_matrix;
229                        state.text_line_matrix = new_matrix;
230                    }
231
232                    ContentOperation::ShowText(text) => {
233                        if in_text_object {
234                            let text_bytes = &text;
235                            let decoded = self.decode_text(text_bytes, &state)?;
236
237                            // Calculate position
238                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
239
240                            // Add spacing based on position change
241                            if !extracted_text.is_empty() {
242                                let dx = x - last_x;
243                                let dy = (y - last_y).abs();
244
245                                if dy > self.options.newline_threshold {
246                                    extracted_text.push('\n');
247                                } else if dx > self.options.space_threshold * state.font_size {
248                                    extracted_text.push(' ');
249                                }
250                            }
251
252                            extracted_text.push_str(&decoded);
253
254                            if self.options.preserve_layout {
255                                fragments.push(TextFragment {
256                                    text: decoded.clone(),
257                                    x,
258                                    y,
259                                    width: calculate_text_width(&decoded, state.font_size),
260                                    height: state.font_size,
261                                    font_size: state.font_size,
262                                });
263                            }
264
265                            // Update position for next text
266                            last_x = x + calculate_text_width(&decoded, state.font_size);
267                            last_y = y;
268
269                            // Update text matrix for next show operation
270                            let text_width = calculate_text_width(&decoded, state.font_size);
271                            let tx = text_width * state.horizontal_scale / 100.0;
272                            state.text_matrix =
273                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
274                        }
275                    }
276
277                    ContentOperation::ShowTextArray(array) => {
278                        if in_text_object {
279                            for item in array {
280                                match item {
281                                    TextElement::Text(text_bytes) => {
282                                        let decoded = self.decode_text(&text_bytes, &state)?;
283                                        extracted_text.push_str(&decoded);
284
285                                        // Update text matrix
286                                        let text_width =
287                                            calculate_text_width(&decoded, state.font_size);
288                                        let tx = text_width * state.horizontal_scale / 100.0;
289                                        state.text_matrix = multiply_matrix(
290                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
291                                            &state.text_matrix,
292                                        );
293                                    }
294                                    TextElement::Spacing(adjustment) => {
295                                        // Text position adjustment (negative = move left)
296                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
297                                        state.text_matrix = multiply_matrix(
298                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
299                                            &state.text_matrix,
300                                        );
301                                    }
302                                }
303                            }
304                        }
305                    }
306
307                    ContentOperation::SetFont(name, size) => {
308                        state.font_name = Some(name);
309                        state.font_size = size as f64;
310                    }
311
312                    ContentOperation::SetLeading(leading) => {
313                        state.leading = leading as f64;
314                    }
315
316                    ContentOperation::SetCharSpacing(spacing) => {
317                        state.char_space = spacing as f64;
318                    }
319
320                    ContentOperation::SetWordSpacing(spacing) => {
321                        state.word_space = spacing as f64;
322                    }
323
324                    ContentOperation::SetHorizontalScaling(scale) => {
325                        state.horizontal_scale = scale as f64;
326                    }
327
328                    ContentOperation::SetTextRise(rise) => {
329                        state.text_rise = rise as f64;
330                    }
331
332                    ContentOperation::SetTextRenderMode(mode) => {
333                        state.render_mode = mode as u8;
334                    }
335
336                    _ => {
337                        // Other operations don't affect text extraction
338                    }
339                }
340            }
341        }
342
343        // Sort and process fragments if requested
344        if self.options.sort_by_position && !fragments.is_empty() {
345            self.sort_and_merge_fragments(&mut fragments);
346        }
347
348        // Reconstruct text from sorted fragments if layout is preserved
349        if self.options.preserve_layout && !fragments.is_empty() {
350            extracted_text = self.reconstruct_text_from_fragments(&fragments);
351        }
352
353        Ok(ExtractedText {
354            text: extracted_text,
355            fragments,
356        })
357    }
358
359    /// Sort text fragments by position and merge them appropriately
360    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
361        // Sort fragments by Y position (top to bottom) then X position (left to right)
362        fragments.sort_by(|a, b| {
363            // First compare Y position (with threshold for same line)
364            let y_diff = (b.y - a.y).abs();
365            if y_diff < self.options.newline_threshold {
366                // Same line, sort by X position
367                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
368            } else {
369                // Different lines, sort by Y (inverted because PDF Y increases upward)
370                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
371            }
372        });
373
374        // Detect columns if requested
375        if self.options.detect_columns {
376            self.detect_and_sort_columns(fragments);
377        }
378    }
379
380    /// Detect columns and re-sort fragments accordingly
381    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
382        // Group fragments by approximate Y position
383        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
384        let mut current_line: Vec<&mut TextFragment> = Vec::new();
385        let mut last_y = f64::INFINITY;
386
387        for fragment in fragments.iter_mut() {
388            let fragment_y = fragment.y;
389            if (last_y - fragment_y).abs() > self.options.newline_threshold
390                && !current_line.is_empty()
391            {
392                lines.push(current_line);
393                current_line = Vec::new();
394            }
395            current_line.push(fragment);
396            last_y = fragment_y;
397        }
398        if !current_line.is_empty() {
399            lines.push(current_line);
400        }
401
402        // Detect column boundaries
403        let mut column_boundaries = vec![0.0];
404        for line in &lines {
405            if line.len() > 1 {
406                for i in 0..line.len() - 1 {
407                    let gap = line[i + 1].x - (line[i].x + line[i].width);
408                    if gap > self.options.column_threshold {
409                        let boundary = line[i].x + line[i].width + gap / 2.0;
410                        if !column_boundaries
411                            .iter()
412                            .any(|&b| (b - boundary).abs() < 10.0)
413                        {
414                            column_boundaries.push(boundary);
415                        }
416                    }
417                }
418            }
419        }
420        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
421
422        // Re-sort fragments by column then Y position
423        if column_boundaries.len() > 1 {
424            fragments.sort_by(|a, b| {
425                // Determine column for each fragment
426                let col_a = column_boundaries
427                    .iter()
428                    .position(|&boundary| a.x < boundary)
429                    .unwrap_or(column_boundaries.len())
430                    - 1;
431                let col_b = column_boundaries
432                    .iter()
433                    .position(|&boundary| b.x < boundary)
434                    .unwrap_or(column_boundaries.len())
435                    - 1;
436
437                if col_a != col_b {
438                    col_a.cmp(&col_b)
439                } else {
440                    // Same column, sort by Y position
441                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
442                }
443            });
444        }
445    }
446
447    /// Reconstruct text from sorted fragments
448    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
449        let mut result = String::new();
450        let mut last_y = f64::INFINITY;
451        let mut last_x = 0.0;
452        let mut last_line_ended_with_hyphen = false;
453
454        for fragment in fragments {
455            // Check if we need a newline
456            let y_diff = (last_y - fragment.y).abs();
457            if !result.is_empty() && y_diff > self.options.newline_threshold {
458                // Handle hyphenation
459                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
460                    // Remove the hyphen and don't add newline
461                    if result.ends_with('-') {
462                        result.pop();
463                    }
464                } else {
465                    result.push('\n');
466                }
467            } else if !result.is_empty() {
468                // Check if we need a space
469                let x_gap = fragment.x - last_x;
470                if x_gap > self.options.space_threshold * fragment.font_size {
471                    result.push(' ');
472                }
473            }
474
475            result.push_str(&fragment.text);
476            last_line_ended_with_hyphen = fragment.text.ends_with('-');
477            last_y = fragment.y;
478            last_x = fragment.x + fragment.width;
479        }
480
481        result
482    }
483
484    /// Extract font resources from page
485    fn extract_font_resources<R: Read + Seek>(
486        &mut self,
487        page: &ParsedPage,
488        document: &PdfDocument<R>,
489    ) -> ParseResult<()> {
490        // Clear previous font cache
491        self.font_cache.clear();
492
493        // Get page resources
494        if let Some(resources) = page.get_resources() {
495            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
496                // Extract each font
497                for (font_name, font_obj) in font_dict.0.iter() {
498                    if let Some(font_ref) = font_obj.as_reference() {
499                        if let Ok(PdfObject::Dictionary(font_dict)) =
500                            document.get_object(font_ref.0, font_ref.1)
501                        {
502                            // Create a CMap extractor to use its font extraction logic
503                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
504
505                            if let Ok(font_info) =
506                                cmap_extractor.extract_font_info(&font_dict, document)
507                            {
508                                self.font_cache.insert(font_name.0.clone(), font_info);
509                                tracing::debug!(
510                                    "Cached font: {} -> {:?}",
511                                    font_name.0,
512                                    self.font_cache.get(&font_name.0)
513                                );
514                            }
515                        }
516                    }
517                }
518            }
519        }
520
521        Ok(())
522    }
523
524    /// Decode text using the current font encoding and ToUnicode mapping
525    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
526        use crate::text::encoding::TextEncoding;
527
528        // First, try to use cached font information with ToUnicode CMap
529        if let Some(ref font_name) = state.font_name {
530            if let Some(font_info) = self.font_cache.get(font_name) {
531                // Create a temporary CMapTextExtractor to use its decoding logic
532                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
533
534                // Try CMap-based decoding first
535                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
536                    tracing::debug!(
537                        "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
538                        font_name,
539                        text,
540                        decoded
541                    );
542                    return Ok(decoded);
543                }
544
545                tracing::debug!(
546                    "CMap decoding failed for font {}, falling back to encoding",
547                    font_name
548                );
549            }
550        }
551
552        // Fall back to encoding-based decoding
553        let encoding = if let Some(ref font_name) = state.font_name {
554            match font_name.to_lowercase().as_str() {
555                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
556                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
557                name if name.contains("standard") => TextEncoding::StandardEncoding,
558                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
559                _ => {
560                    // Default based on common patterns
561                    if font_name.starts_with("Times")
562                        || font_name.starts_with("Helvetica")
563                        || font_name.starts_with("Courier")
564                    {
565                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
566                    } else {
567                        TextEncoding::PdfDocEncoding // Safe default
568                    }
569                }
570            }
571        } else {
572            TextEncoding::WinAnsiEncoding // Default for most PDFs
573        };
574
575        let fallback_result = encoding.decode(text);
576        tracing::debug!(
577            "Fallback encoding decoding: {:?} -> \"{}\"",
578            text,
579            fallback_result
580        );
581        Ok(fallback_result)
582    }
583}
584
585impl Default for TextExtractor {
586    fn default() -> Self {
587        Self::new()
588    }
589}
590
/// Compute the product `a × b` of two affine transforms.
///
/// Each matrix is given in the six-element form `[a, b, c, d, e, f]`, standing
/// for the 3×3 matrix with rows `(a, b, 0)`, `(c, d, 0)` and `(e, f, 1)`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear part
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation part
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
602
/// Apply the affine transform `matrix` (six-element `[a, b, c, d, e, f]` form)
/// to the point `(x, y)` and return the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
609
/// Calculate approximate text width (simplified)
///
/// Assumes every character is half the font size wide. The estimate is based
/// on the number of characters, not the UTF-8 byte length, so non-ASCII text
/// is not counted two to four times per character.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
615
#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transform used as the neutral element in the matrix tests.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Build the fragment shape shared by the fragment-oriented tests:
    /// 50 wide, 12 high, font size 10, placed at `(x, 200)`.
    fn fragment_at(text: &str, x: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        }
    }

    /// Assert that `options` carries the documented default values.
    fn assert_default_options(options: &ExtractionOptions) {
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    #[test]
    fn test_matrix_multiplication() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        // The identity is neutral on either side.
        assert_eq!(multiply_matrix(&IDENTITY, &translation), translation);
        assert_eq!(multiply_matrix(&translation, &IDENTITY), translation);
    }

    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        assert_default_options(&ExtractionOptions::default());
    }

    #[test]
    fn test_extraction_options_custom() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };

        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }

    #[test]
    fn test_text_fragment() {
        let fragment = fragment_at("Hello", 100.0);

        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments: vec![fragment_at("Hello", 100.0), fragment_at("World", 160.0)],
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }

    #[test]
    fn test_text_state_default() {
        let state = TextState::default();

        assert_eq!(state.text_matrix, IDENTITY);
        assert_eq!(state.text_line_matrix, IDENTITY);
        assert_eq!(state.ctm, IDENTITY);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // 90 degree rotation
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        assert_eq!(transform_point(1.0, 0.0, &rotation), (0.0, 1.0));

        // Non-uniform scaling
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        assert_eq!(transform_point(5.0, 5.0, &scale), (10.0, 15.0));

        // Combined transform: x = 2*1 + 1*1 + 10, y = 1*1 + 2*1 + 20
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        assert_eq!(transform_point(1.0, 1.0, &complex), (13.0, 23.0));
    }

    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        assert_default_options(&extractor.options);
    }

    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };

        let extractor = TextExtractor::with_options(options.clone());
        let stored = &extractor.options;

        assert_eq!(stored.preserve_layout, options.preserve_layout);
        assert_eq!(stored.space_threshold, options.space_threshold);
        assert_eq!(stored.newline_threshold, options.newline_threshold);
        assert_eq!(stored.sort_by_position, options.sort_by_position);
        assert_eq!(stored.detect_columns, options.detect_columns);
        assert_eq!(stored.column_threshold, options.column_threshold);
        assert_eq!(stored.merge_hyphenated, options.merge_hyphenated);
    }
}
// oxidize_pdf/text/extraction.rs