oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Text extraction options
///
/// Controls how [`TextExtractor`] turns content-stream text operations into
/// plain text and positioned fragments.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning).
    ///
    /// When true, a `TextFragment` is recorded for each shown string and the
    /// final text is rebuilt from the (optionally sorted) fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the current font size, above which a
    /// space character is inserted (the gap is compared against
    /// `space_threshold * font_size`).
    pub space_threshold: f64,
    /// Minimum vertical distance between two text positions to insert a newline.
    /// Also used as the band height when sorting fragments into lines.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts).
    ///
    /// Only has an effect when fragments were collected, i.e. together with
    /// `preserve_layout`.
    pub sort_by_position: bool,
    /// Detect and handle columns (applied as part of position sorting)
    pub detect_columns: bool,
    /// Column separation threshold: horizontal gap between neighbouring
    /// fragments on a line above which a column boundary is assumed
    /// (in page units)
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends (drops the trailing `-` and the
    /// line break when rebuilding text from fragments)
    pub merge_hyphenated: bool,
    /// Track space insertion decisions in each TextFragment (default: false).
    /// When false: zero overhead. When true: populates `TextFragment::space_decisions`.
    pub track_space_decisions: bool,
}
37
38impl Default for ExtractionOptions {
39    fn default() -> Self {
40        Self {
41            preserve_layout: false,
42            space_threshold: 0.3,
43            newline_threshold: 10.0,
44            sort_by_position: true,
45            detect_columns: false,
46            column_threshold: 50.0,
47            merge_hyphenated: true,
48            track_space_decisions: false,
49        }
50    }
51}
52
/// Extracted text with position information
///
/// Result of extracting a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text content
    pub text: String,
    /// Text fragments with position information (if preserve_layout is true;
    /// otherwise no fragments are recorded and this is empty)
    pub fragments: Vec<TextFragment>,
}
61
/// Metadata about a space insertion decision during text extraction.
/// Only populated when [`ExtractionOptions::track_space_decisions`] is `true`.
///
/// One record describes a single point where the extractor compared a
/// horizontal gap against a threshold, whether or not a space was emitted.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Character offset in the extracted text.
    pub offset: usize,
    /// Actual horizontal gap (dx) in text space units.
    pub dx: f64,
    /// The threshold used at this point.
    pub threshold: f64,
    /// Confidence: `|dx - threshold| / threshold`, clamped to [0.0, 1.0].
    /// Values near 0.0 mean the gap was close to the threshold.
    pub confidence: f64,
    /// Whether a space was inserted.
    pub inserted: bool,
}
77
/// A fragment of text with position information
///
/// Fragments are the unit that sorting, column detection, merging and text
/// reconstruction operate on.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Text content
    pub text: String,
    /// X position in page coordinates (left edge; `x + width` is treated as
    /// the right edge when measuring gaps between fragments)
    pub x: f64,
    /// Y position in page coordinates
    pub y: f64,
    /// Width of the text
    pub width: f64,
    /// Height of the text
    pub height: f64,
    /// Font size
    pub font_size: f64,
    /// Font name (if known) - used for kerning-aware text spacing
    pub font_name: Option<String>,
    /// Whether the font is bold (detected from font name)
    pub is_bold: bool,
    /// Whether the font is italic (detected from font name)
    pub is_italic: bool,
    /// Fill color of the text (from graphics state)
    pub color: Option<Color>,
    /// Space insertion decisions (empty unless `track_space_decisions` is true).
    pub space_decisions: Vec<SpaceDecision>,
}
104
/// Text extraction state
///
/// Mutable state tracked while walking a page's content-stream operations.
/// Matrices are stored as the six PDF matrix components `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix (advanced after each shown string)
    text_matrix: [f64; 6],
    /// Current text line matrix (start of the current line; base for line moves)
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM)
    ctm: [f64; 6],
    /// Text leading (line spacing)
    leading: f64,
    /// Character spacing
    char_space: f64,
    /// Word spacing
    word_space: f64,
    /// Horizontal scaling, as a percentage (100.0 = unscaled)
    horizontal_scale: f64,
    /// Text rise
    text_rise: f64,
    /// Current font size
    font_size: f64,
    /// Current font name (the name given to the font-selection operator;
    /// used as the key into the extractor's per-page font cache)
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    render_mode: u8,
    /// Fill color (for text rendering)
    fill_color: Option<Color>,
}
132
133impl Default for TextState {
134    fn default() -> Self {
135        Self {
136            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139            leading: 0.0,
140            char_space: 0.0,
141            word_space: 0.0,
142            horizontal_scale: 100.0,
143            text_rise: 0.0,
144            font_size: 0.0,
145            font_name: None,
146            render_mode: 0,
147            fill_color: None,
148        }
149    }
150}
151
/// Parse font style (bold/italic) from font name
///
/// Detects bold and italic styles from common font naming patterns.
/// Works with PostScript font names (e.g., "Helvetica-Bold", "Times-BoldItalic")
/// and TrueType names (e.g., "Arial Bold", "Courier Oblique").
///
/// Matching is case-insensitive and uses two kinds of evidence:
///
/// * the words `bold`, `italic` or `oblique` anywhere in the name, and
/// * a standalone style abbreviation (`B`, `Bd`, `I`, `It`, `BI`, `BdIt`)
///   appearing as a complete `-` or space separated token after the family
///   name.
///
/// Abbreviations must match a whole token, so names that merely *start* a
/// suffix with `b` or `i` (e.g. "Futura-Book", "Arial-Identity-H") are not
/// misreported as bold or italic.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let name_lower = font_name.to_lowercase();

    // True when any token after the first one (the family name) is exactly one
    // of the given abbreviations. Tokens are delimited by '-' or ' '.
    let has_style_token = |abbreviations: &[&str]| -> bool {
        name_lower
            .split(|c: char| c == '-' || c == ' ')
            .skip(1)
            .any(|token| abbreviations.contains(&token))
    };

    // Detect bold from common patterns
    let is_bold = name_lower.contains("bold") || has_style_token(&["b", "bd", "bi", "bdit"]);

    // Detect italic/oblique from common patterns
    let is_italic = name_lower.contains("italic")
        || name_lower.contains("oblique")
        || has_style_token(&["i", "it", "bi", "bdit"]);

    (is_bold, is_italic)
}
190
/// Text extractor for PDF pages with CMap support
///
/// Holds the extraction options plus two font caches that are filled while
/// pages are processed, which is why the extraction methods take `&mut self`.
pub struct TextExtractor {
    /// Options controlling spacing, sorting and layout handling
    options: ExtractionOptions,
    /// Font cache for the current page (name-keyed, rebuilt per page since names are page-local)
    font_cache: HashMap<String, FontInfo>,
    /// Persistent font cache keyed by PDF object reference — avoids re-parsing the same font
    /// object across pages. Most multi-page PDFs reuse the same font objects.
    /// NOTE(review): the `(u32, u16)` key is presumably (object number, generation) — confirm.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202    /// Create a new text extractor with default options
203    pub fn new() -> Self {
204        Self {
205            options: ExtractionOptions::default(),
206            font_cache: HashMap::new(),
207            font_object_cache: HashMap::new(),
208        }
209    }
210
211    /// Create a text extractor with custom options
212    pub fn with_options(options: ExtractionOptions) -> Self {
213        Self {
214            options,
215            font_cache: HashMap::new(),
216            font_object_cache: HashMap::new(),
217        }
218    }
219
220    /// Extract text from a PDF document
221    pub fn extract_from_document<R: Read + Seek>(
222        &mut self,
223        document: &PdfDocument<R>,
224    ) -> ParseResult<Vec<ExtractedText>> {
225        let page_count = document.page_count()?;
226        let mut results = Vec::new();
227
228        for i in 0..page_count {
229            let text = self.extract_from_page(document, i)?;
230            results.push(text);
231        }
232
233        Ok(results)
234    }
235
    /// Extract text from a specific page
    ///
    /// `page_index` is zero-based (log messages print `page_index + 1`).
    ///
    /// The page's font resources are loaded first, then each content stream is
    /// parsed and its operations are replayed against a [`TextState`]. Text-showing
    /// operations append decoded text to the output; when `preserve_layout` is set
    /// a [`TextFragment`] is also recorded per shown string, and the final text is
    /// rebuilt from the (optionally sorted, then merged) fragments.
    ///
    /// # Errors
    ///
    /// Propagates errors from fetching the page, loading font resources, reading
    /// the content streams and decoding text. A content stream that fails to
    /// *parse* is not an error: it is logged at debug level and skipped.
    pub fn extract_from_page<R: Read + Seek>(
        &mut self,
        document: &PdfDocument<R>,
        page_index: u32,
    ) -> ParseResult<ExtractedText> {
        // Get the page
        let page = document.get_page(page_index)?;

        // Extract font resources first
        {
            let _span = tracing::info_span!("font_resources").entered();
            self.extract_font_resources(&page, document)?;
        }

        // Get content streams
        let streams = {
            let _span = tracing::info_span!("stream_decompress").entered();
            page.content_streams_with_document(document)?
        };

        // All of the following state is shared across the page's content
        // streams: it is initialised once here, not once per stream.
        let mut extracted_text = String::new();
        let mut fragments = Vec::new();
        let mut state = TextState::default();
        let mut in_text_object = false;
        // End position (right edge x, baseline y) of the most recently shown
        // string; used to decide whether to insert a space or newline.
        let mut last_x = 0.0;
        let mut last_y = 0.0;

        // Process each content stream
        for (stream_idx, stream_data) in streams.iter().enumerate() {
            let operations = match {
                let _span = tracing::info_span!("content_parse").entered();
                ContentParser::parse_content(stream_data)
            } {
                Ok(ops) => ops,
                Err(e) => {
                    // Enhanced diagnostic logging for content stream parsing failures
                    tracing::debug!(
                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
                        page_index + 1,
                        stream_idx + 1,
                        streams.len()
                    );
                    tracing::debug!("         Error: {}", e);
                    tracing::debug!("         Stream size: {} bytes", stream_data.len());

                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
                    let preview_len = stream_data.len().min(100);
                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
                    tracing::debug!(
                        "         Stream preview (first {} bytes): {:?}",
                        preview_len,
                        preview.chars().take(80).collect::<String>()
                    );

                    // Continue processing other streams
                    continue;
                }
            };

            let _ops_span = tracing::info_span!("text_ops_loop").entered();
            for op in operations {
                match op {
                    ContentOperation::BeginText => {
                        in_text_object = true;
                        // Reset text matrix to identity
                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                    }

                    ContentOperation::EndText => {
                        in_text_object = false;
                    }

                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
                        // Replaces (does not concatenate with) both the text
                        // matrix and the text line matrix.
                        state.text_matrix =
                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
                        state.text_line_matrix =
                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
                    }

                    ContentOperation::MoveText(tx, ty) => {
                        // Update text matrix by translation
                        // (relative to the start of the current line, not the pen position)
                        let new_matrix = multiply_matrix(
                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
                            &state.text_line_matrix,
                        );
                        state.text_matrix = new_matrix;
                        state.text_line_matrix = new_matrix;
                    }

                    ContentOperation::NextLine => {
                        // Move to next line using current leading
                        let new_matrix = multiply_matrix(
                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
                            &state.text_line_matrix,
                        );
                        state.text_matrix = new_matrix;
                        state.text_line_matrix = new_matrix;
                    }

                    ContentOperation::ShowText(text) => {
                        if in_text_object {
                            let text_bytes = &text;
                            let decoded = self.decode_text(text_bytes, &state)?;

                            // Pen origin in user space = (CTM × text_matrix)(0, 0).
                            let (x, y) = text_origin(&state);

                            // Add spacing based on position change
                            // (newline takes precedence over space; nothing is
                            // inserted before the very first piece of text)
                            if !extracted_text.is_empty() {
                                let dx = x - last_x;
                                let dy = (y - last_y).abs();

                                if dy > self.options.newline_threshold {
                                    extracted_text.push('\n');
                                } else if dx > self.options.space_threshold * state.font_size {
                                    extracted_text.push(' ');
                                }
                            }

                            extracted_text.push_str(&decoded);

                            // Get font info for accurate width calculation
                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));

                            // Calculate width once and reuse
                            let text_width =
                                calculate_text_width(&decoded, state.font_size, font_info);

                            if self.options.preserve_layout {
                                emit_text_fragment(
                                    &mut fragments,
                                    &decoded,
                                    text_width,
                                    x,
                                    y,
                                    &state,
                                );
                            }

                            // Update position for next text
                            last_x = x + text_width;
                            last_y = y;

                            // Update text matrix for next show operation
                            let tx = text_width * state.horizontal_scale / 100.0;
                            state.text_matrix =
                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
                        }
                    }

                    ContentOperation::ShowTextArray(array) => {
                        // NOTE(review): unlike ShowText, this branch inserts no
                        // space/newline separators into `extracted_text` and does
                        // not update `last_x`/`last_y`, so a following ShowText
                        // measures its gap from a stale position — confirm intended.
                        if in_text_object {
                            // Get font info for accurate width calculation
                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));

                            for item in array {
                                match item {
                                    TextElement::Text(text_bytes) => {
                                        let decoded = self.decode_text(&text_bytes, &state)?;
                                        extracted_text.push_str(&decoded);

                                        let text_width = calculate_text_width(
                                            &decoded,
                                            state.font_size,
                                            font_info,
                                        );

                                        if self.options.preserve_layout {
                                            let (x, y) = text_origin(&state);
                                            emit_text_fragment(
                                                &mut fragments,
                                                &decoded,
                                                text_width,
                                                x,
                                                y,
                                                &state,
                                            );
                                        }

                                        // Advance the pen past the string just shown.
                                        let tx = text_width * state.horizontal_scale / 100.0;
                                        state.text_matrix = multiply_matrix(
                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
                                            &state.text_matrix,
                                        );
                                    }
                                    TextElement::Spacing(adjustment) => {
                                        // Text position adjustment (negative = move left)
                                        // The adjustment is in thousandths of the font size.
                                        // NOTE(review): horizontal_scale is not applied
                                        // here, unlike the string advance above — confirm.
                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
                                        state.text_matrix = multiply_matrix(
                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
                                            &state.text_matrix,
                                        );
                                    }
                                }
                            }
                        }
                    }

                    ContentOperation::NextLineShowText(text) => {
                        if in_text_object {
                            // ' = T* then Tj string. Advance line matrix by -leading.
                            let new_matrix = multiply_matrix(
                                &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
                                &state.text_line_matrix,
                            );
                            state.text_matrix = new_matrix;
                            state.text_line_matrix = new_matrix;

                            let decoded = self.decode_text(&text, &state)?;
                            let (x, y) = text_origin(&state);

                            // This operator always starts a new line, so a newline is
                            // emitted unconditionally (no threshold check).
                            if !extracted_text.is_empty() {
                                extracted_text.push('\n');
                            }
                            extracted_text.push_str(&decoded);

                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));
                            let text_width =
                                calculate_text_width(&decoded, state.font_size, font_info);

                            if self.options.preserve_layout {
                                emit_text_fragment(
                                    &mut fragments,
                                    &decoded,
                                    text_width,
                                    x,
                                    y,
                                    &state,
                                );
                            }

                            last_x = x + text_width;
                            last_y = y;

                            let tx = text_width * state.horizontal_scale / 100.0;
                            state.text_matrix =
                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
                        }
                    }

                    ContentOperation::SetSpacingNextLineShowText(word_space, char_space, text) => {
                        if in_text_object {
                            // " = aw Tw, ac Tc, then ' string. ISO 32000-1 §9.4.3.
                            // The variant fields mirror the spec field names:
                            // (word_spacing, char_spacing, text).
                            state.word_space = word_space as f64;
                            state.char_space = char_space as f64;

                            let new_matrix = multiply_matrix(
                                &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
                                &state.text_line_matrix,
                            );
                            state.text_matrix = new_matrix;
                            state.text_line_matrix = new_matrix;

                            let decoded = self.decode_text(&text, &state)?;
                            let (x, y) = text_origin(&state);

                            // As with ', the line break is unconditional.
                            if !extracted_text.is_empty() {
                                extracted_text.push('\n');
                            }
                            extracted_text.push_str(&decoded);

                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));
                            let text_width =
                                calculate_text_width(&decoded, state.font_size, font_info);

                            if self.options.preserve_layout {
                                emit_text_fragment(
                                    &mut fragments,
                                    &decoded,
                                    text_width,
                                    x,
                                    y,
                                    &state,
                                );
                            }

                            last_x = x + text_width;
                            last_y = y;

                            let tx = text_width * state.horizontal_scale / 100.0;
                            state.text_matrix =
                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
                        }
                    }

                    // The state setters below are applied whether or not a
                    // text object is currently open.
                    ContentOperation::SetFont(name, size) => {
                        state.font_name = Some(name);
                        state.font_size = size as f64;
                    }

                    ContentOperation::SetLeading(leading) => {
                        state.leading = leading as f64;
                    }

                    ContentOperation::SetCharSpacing(spacing) => {
                        state.char_space = spacing as f64;
                    }

                    ContentOperation::SetWordSpacing(spacing) => {
                        state.word_space = spacing as f64;
                    }

                    ContentOperation::SetHorizontalScaling(scale) => {
                        state.horizontal_scale = scale as f64;
                    }

                    ContentOperation::SetTextRise(rise) => {
                        state.text_rise = rise as f64;
                    }

                    ContentOperation::SetTextRenderMode(mode) => {
                        state.render_mode = mode as u8;
                    }

                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
                        // Update CTM: new_ctm = concat_matrix * current_ctm
                        // NOTE(review): no arm in this match saves/restores the
                        // CTM, so concatenations accumulate for the rest of the
                        // page — confirm this is handled or acceptable.
                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
                        let a = a as f64;
                        let b = b as f64;
                        let c = c as f64;
                        let d = d as f64;
                        let e = e as f64;
                        let f = f as f64;
                        state.ctm = [
                            a * a0 + b * c0,
                            a * b0 + b * d0,
                            c * a0 + d * c0,
                            c * b0 + d * d0,
                            e * a0 + f * c0 + e0,
                            e * b0 + f * d0 + f0,
                        ];
                    }

                    // Color operations (Phase 4: Color extraction)
                    ContentOperation::SetNonStrokingGray(gray) => {
                        state.fill_color = Some(Color::gray(gray as f64));
                    }

                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
                    }

                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
                        state.fill_color =
                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
                    }

                    _ => {
                        // Other operations don't affect text extraction
                    }
                }
            }
        }

        {
            let _span = tracing::info_span!("layout_finalize").entered();

            // Sort and process fragments if requested
            // (fragments are only collected when preserve_layout is set)
            if self.options.sort_by_position && !fragments.is_empty() {
                self.sort_and_merge_fragments(&mut fragments);
            }

            // Merge close fragments to eliminate spacing artifacts
            // This is crucial for table detection and structured data extraction
            if self.options.preserve_layout && !fragments.is_empty() {
                fragments = self.merge_close_fragments(&fragments);
            }

            // Reconstruct text from sorted fragments if layout is preserved
            // (this replaces the text accumulated during the operation loop)
            if self.options.preserve_layout && !fragments.is_empty() {
                extracted_text = self.reconstruct_text_from_fragments(&fragments);
            }
        }

        Ok(ExtractedText {
            text: extracted_text,
            fragments,
        })
    }
631
632    /// Sort text fragments by position and merge them appropriately
633    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
634        // Sort fragments by Y position (top to bottom) then X position (left to right).
635        //
636        // We quantize Y into bands of `newline_threshold` width so that fragments
637        // on the "same line" get identical Y keys. This ensures the comparator is
638        // a strict total order (transitive), which Rust's sort algorithm requires.
639        // Without quantization, threshold-based "same line" detection breaks
640        // transitivity: A≈B and B≈C does NOT imply A≈C.
641        let threshold = self.options.newline_threshold;
642        fragments.sort_by(|a, b| {
643            // Quantize Y to nearest band (PDF Y increases upward, so negate first)
644            let band_a = if threshold > 0.0 {
645                (-a.y / threshold).round()
646            } else {
647                -a.y
648            };
649            let band_b = if threshold > 0.0 {
650                (-b.y / threshold).round()
651            } else {
652                -b.y
653            };
654
655            // Compare by Y band (top to bottom), then by X within same band
656            band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
657        });
658
659        // Detect columns if requested
660        if self.options.detect_columns {
661            self.detect_and_sort_columns(fragments);
662        }
663    }
664
665    /// Detect columns and re-sort fragments accordingly
666    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
667        // Group fragments by approximate Y position
668        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
669        let mut current_line: Vec<&mut TextFragment> = Vec::new();
670        let mut last_y = f64::INFINITY;
671
672        for fragment in fragments.iter_mut() {
673            let fragment_y = fragment.y;
674            if (last_y - fragment_y).abs() > self.options.newline_threshold
675                && !current_line.is_empty()
676            {
677                lines.push(current_line);
678                current_line = Vec::new();
679            }
680            current_line.push(fragment);
681            last_y = fragment_y;
682        }
683        if !current_line.is_empty() {
684            lines.push(current_line);
685        }
686
687        // Detect column boundaries
688        let mut column_boundaries = vec![0.0];
689        for line in &lines {
690            if line.len() > 1 {
691                for i in 0..line.len() - 1 {
692                    let gap = line[i + 1].x - (line[i].x + line[i].width);
693                    if gap > self.options.column_threshold {
694                        let boundary = line[i].x + line[i].width + gap / 2.0;
695                        if !column_boundaries
696                            .iter()
697                            .any(|&b| (b - boundary).abs() < 10.0)
698                        {
699                            column_boundaries.push(boundary);
700                        }
701                    }
702                }
703            }
704        }
705        column_boundaries.sort_by(|a, b| a.total_cmp(b));
706
707        // Re-sort fragments by column then Y position
708        if column_boundaries.len() > 1 {
709            fragments.sort_by(|a, b| {
710                // Determine column for each fragment
711                let col_a = column_boundaries
712                    .iter()
713                    .position(|&boundary| a.x < boundary)
714                    .unwrap_or(column_boundaries.len())
715                    - 1;
716                let col_b = column_boundaries
717                    .iter()
718                    .position(|&boundary| b.x < boundary)
719                    .unwrap_or(column_boundaries.len())
720                    - 1;
721
722                if col_a != col_b {
723                    col_a.cmp(&col_b)
724                } else {
725                    // Same column, sort by Y position
726                    b.y.total_cmp(&a.y)
727                }
728            });
729        }
730    }
731
732    /// Reconstruct text from sorted fragments
733    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
734        // First, merge consecutive fragments that are very close together
735        let merged_fragments = self.merge_close_fragments(fragments);
736
737        let mut result = String::new();
738        let mut last_y = f64::INFINITY;
739        let mut last_x = 0.0;
740        let mut last_line_ended_with_hyphen = false;
741
742        for fragment in &merged_fragments {
743            // Check if we need a newline
744            let y_diff = (last_y - fragment.y).abs();
745            if !result.is_empty() && y_diff > self.options.newline_threshold {
746                // Handle hyphenation
747                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
748                    // Remove the hyphen and don't add newline
749                    if result.ends_with('-') {
750                        result.pop();
751                    }
752                } else {
753                    result.push('\n');
754                }
755            } else if !result.is_empty() {
756                // Check if we need a space
757                let x_gap = fragment.x - last_x;
758                if x_gap > self.options.space_threshold * fragment.font_size {
759                    result.push(' ');
760                }
761            }
762
763            result.push_str(&fragment.text);
764            last_line_ended_with_hyphen = fragment.text.ends_with('-');
765            last_y = fragment.y;
766            last_x = fragment.x + fragment.width;
767        }
768
769        result
770    }
771
772    /// Merge fragments that are very close together on the same line
773    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
774    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
775        if fragments.is_empty() {
776            return Vec::new();
777        }
778
779        let mut merged = Vec::new();
780        let mut current = fragments[0].clone();
781
782        for fragment in &fragments[1..] {
783            // Check if this fragment is on the same line and very close
784            let y_diff = (current.y - fragment.y).abs();
785            let x_gap = fragment.x - (current.x + current.width);
786
787            // Merge if on same line and gap is less than a character width
788            // Use 0.5 * font_size as threshold - this catches most artificial spacing
789            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
790                && x_gap >= 0.0  // Fragment is to the right
791                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
792
793            if should_merge {
794                // Merge this fragment into current
795                current.text.push_str(&fragment.text);
796                current.width = (fragment.x + fragment.width) - current.x;
797            } else {
798                // Start a new fragment
799                merged.push(current);
800                current = fragment.clone();
801            }
802        }
803
804        merged.push(current);
805        merged
806    }
807
808    /// Extract font resources from page
809    ///
810    /// Clears the per-page name cache (font names are page-local in PDF), but
811    /// reuses previously parsed font objects via `font_object_cache` to avoid
812    /// re-parsing the same font object across multiple pages.
813    fn extract_font_resources<R: Read + Seek>(
814        &mut self,
815        page: &ParsedPage,
816        document: &PdfDocument<R>,
817    ) -> ParseResult<()> {
818        // Clear per-page name mapping (font names like /F1 are page-local)
819        self.font_cache.clear();
820
821        // Try to get resources manually from page dictionary first
822        // This is necessary because ParsedPage.get_resources() may not always work
823        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
824            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
825            {
826                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
827                    for (font_name, font_obj) in font_dict.0.iter() {
828                        if let Some(font_ref) = font_obj.as_reference() {
829                            self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
830                        }
831                    }
832                }
833            }
834        } else if let Some(resources) = page.get_resources() {
835            // Fallback to get_resources() if Resources is not a reference
836            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
837                for (font_name, font_obj) in font_dict.0.iter() {
838                    if let Some(font_ref) = font_obj.as_reference() {
839                        self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
840                    }
841                }
842            }
843        }
844
845        Ok(())
846    }
847
848    /// Cache a font, reusing the persistent object cache when possible.
849    fn cache_font_by_ref<R: Read + Seek>(
850        &mut self,
851        font_name: &str,
852        font_ref: (u32, u16),
853        document: &PdfDocument<R>,
854    ) {
855        // Check persistent object cache first — avoids re-parsing across pages
856        if let Some(cached) = self.font_object_cache.get(&font_ref) {
857            self.font_cache
858                .insert(font_name.to_string(), cached.clone());
859            tracing::debug!(
860                "Reused cached font object ({}, {}): {} (ToUnicode: {})",
861                font_ref.0,
862                font_ref.1,
863                font_name,
864                cached.to_unicode.is_some()
865            );
866            return;
867        }
868
869        // Parse font object
870        if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
871            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
872            if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
873                let has_to_unicode = font_info.to_unicode.is_some();
874                // Store in persistent cache
875                self.font_object_cache.insert(font_ref, font_info.clone());
876                // Store in per-page name cache
877                self.font_cache.insert(font_name.to_string(), font_info);
878                tracing::debug!(
879                    "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
880                    font_ref.0,
881                    font_ref.1,
882                    font_name,
883                    has_to_unicode
884                );
885            }
886        }
887    }
888
889    /// Decode text using the current font encoding and ToUnicode mapping
890    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
891        use crate::text::encoding::TextEncoding;
892
893        // First, try to use cached font information with ToUnicode CMap
894        if let Some(ref font_name) = state.font_name {
895            if let Some(font_info) = self.font_cache.get(font_name) {
896                // Try CMap-based decoding first (free function — no allocation)
897                if let Ok(decoded) =
898                    crate::text::extraction_cmap::decode_text_with_font(text, font_info)
899                {
900                    // Only accept if we got meaningful text (not all null bytes or garbage)
901                    if !decoded.trim().is_empty()
902                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
903                    {
904                        // Apply sanitization to remove control characters (Issue #116)
905                        let sanitized = sanitize_extracted_text(&decoded);
906                        tracing::debug!(
907                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
908                            font_name,
909                            text,
910                            sanitized
911                        );
912                        return Ok(sanitized);
913                    }
914                }
915
916                tracing::debug!(
917                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
918                    font_name
919                );
920            }
921        }
922
923        // Fall back to encoding-based decoding
924        let encoding = if let Some(ref font_name) = state.font_name {
925            match font_name.to_lowercase().as_str() {
926                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
927                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
928                name if name.contains("standard") => TextEncoding::StandardEncoding,
929                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
930                _ => {
931                    // Default based on common patterns
932                    if font_name.starts_with("Times")
933                        || font_name.starts_with("Helvetica")
934                        || font_name.starts_with("Courier")
935                    {
936                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
937                    } else {
938                        TextEncoding::PdfDocEncoding // Safe default
939                    }
940                }
941            }
942        } else {
943            TextEncoding::WinAnsiEncoding // Default for most PDFs
944        };
945
946        let fallback_result = encoding.decode(text);
947        // Apply sanitization to remove control characters (Issue #116)
948        let sanitized = sanitize_extracted_text(&fallback_result);
949        tracing::debug!(
950            "Fallback encoding decoding: {:?} -> \"{}\"",
951            text,
952            sanitized
953        );
954        Ok(sanitized)
955    }
956}
957
958impl Default for TextExtractor {
959    fn default() -> Self {
960        Self::new()
961    }
962}
963
964/// Push a `TextFragment` capturing a glyph run about to be shown.
965///
966/// Encapsulates the style-derivation + push sequence shared by every
967/// text-show operator handler in `extract_from_page` (`Tj`, `TJ`, `'`,
968/// `"`). The caller supplies the pen origin `(x, y)` already mapped to
969/// user space (typically via `text_origin(&state)`); doing so avoids the
970/// double `multiply_matrix + transform_point` that prior versions did
971/// (handler computed it for `last_x`/`last_y`, then this fn recomputed
972/// it on the same `state`).
973fn emit_text_fragment(
974    fragments: &mut Vec<TextFragment>,
975    decoded: &str,
976    text_width: f64,
977    x: f64,
978    y: f64,
979    state: &TextState,
980) {
981    if decoded.is_empty() {
982        return;
983    }
984    let (is_bold, is_italic) = state
985        .font_name
986        .as_ref()
987        .map(|name| parse_font_style(name))
988        .unwrap_or((false, false));
989    fragments.push(TextFragment {
990        text: decoded.to_owned(),
991        x,
992        y,
993        width: text_width,
994        height: state.font_size,
995        font_size: state.font_size,
996        font_name: state.font_name.clone(),
997        is_bold,
998        is_italic,
999        color: state.fill_color,
1000        space_decisions: Vec::new(),
1001    });
1002}
1003
1004/// Pen origin (user-space coordinates) of the next glyph in the current
1005/// text state — i.e. `CTM × text_matrix` applied to (0, 0). Centralized
1006/// to eliminate the duplicated `multiply_matrix + transform_point` pair
1007/// across the four `extract_from_page` show-text handlers.
1008fn text_origin(state: &TextState) -> (f64, f64) {
1009    let combined = multiply_matrix(&state.ctm, &state.text_matrix);
1010    transform_point(0.0, 0.0, &combined)
1011}
1012
/// Multiply two 2D affine matrices given in `[a, b, c, d, e, f]` form.
///
/// The result is the product `a × b` under the row-vector convention used by
/// `transform_point`: transforming a point with the result is the same as
/// transforming it with `a` first and then with `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a11, a12, a21, a22, a_tx, a_ty] = *a;
    let [b11, b12, b21, b22, b_tx, b_ty] = *b;

    [
        // linear part
        a11 * b11 + a12 * b21,
        a11 * b12 + a12 * b22,
        a21 * b11 + a22 * b21,
        a21 * b12 + a22 * b22,
        // translation part
        a_tx * b11 + a_ty * b21 + b_tx,
        a_tx * b12 + a_ty * b22 + b_ty,
    ]
}
1024
/// Apply the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
1031
1032/// Calculate text width using actual font metrics (including kerning)
1033fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
1034    // If we have font metrics, use them for accurate width calculation
1035    if let Some(font) = font_info {
1036        if let Some(ref widths) = font.metrics.widths {
1037            let first_char = font.metrics.first_char.unwrap_or(0);
1038            let last_char = font.metrics.last_char.unwrap_or(255);
1039            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
1040
1041            let mut total_width = 0.0;
1042            let mut chars = text.chars().peekable();
1043
1044            while let Some(ch) = chars.next() {
1045                let char_code = ch as u32;
1046
1047                // Get width from Widths array or use missing_width
1048                let width = if char_code >= first_char && char_code <= last_char {
1049                    let index = (char_code - first_char) as usize;
1050                    widths.get(index).copied().unwrap_or(missing_width)
1051                } else {
1052                    missing_width
1053                };
1054
1055                // Convert from glyph space (1/1000 units) to user space
1056                total_width += width / 1000.0 * font_size;
1057
1058                // Apply kerning if available (for character pairs)
1059                if let Some(ref kerning) = font.metrics.kerning {
1060                    if let Some(&next_ch) = chars.peek() {
1061                        let next_char = next_ch as u32;
1062                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
1063                            // Kerning is in FUnits (1/1000), convert to user space
1064                            total_width += kern_value / 1000.0 * font_size;
1065                        }
1066                    }
1067                }
1068            }
1069
1070            return total_width;
1071        }
1072    }
1073
1074    // Fallback to simplified calculation if no metrics available
1075    text.len() as f64 * font_size * 0.5
1076}
1077
/// Clean up extracted text by replacing or dropping control characters.
///
/// Addresses Issue #116, where extracted text contained NUL bytes (`\0`) and
/// ETX characters (`\u{3}`) in places where spaces belong.
///
/// # Behavior
///
/// - `\0` becomes a space; a `\0\u{3}` pair therefore yields a single space
///   because the ETX itself is dropped
/// - `\t`, `\n` and `\r` are kept as they are
/// - every other ASCII control character (including a lone `\u{3}`) is removed
/// - runs of spaces (literal or produced from `\0`) collapse into one space;
///   a space directly after a tab is dropped as well, while `\n` and `\r`
///   start a fresh run
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX as word separator
/// let dirty = "a\0\u{3}sergeant\0\u{3}and";
/// assert_eq!(sanitize_extracted_text(dirty), "a sergeant and");
///
/// // Standalone NUL becomes space
/// let with_nul = "word\0another";
/// assert_eq!(sanitize_extracted_text(with_nul), "word another");
///
/// // Clean text passes through unchanged
/// let clean = "Normal text";
/// assert_eq!(sanitize_extracted_text(clean), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // Output never grows: every replacement is one byte for one byte, or a removal.
    let mut cleaned = String::with_capacity(text.len());
    // True while a following space must be suppressed.
    let mut suppress_space = false;

    for ch in text.chars() {
        match ch {
            // NUL acts as a space; both collapse into a single separator.
            '\0' | ' ' => {
                if !suppress_space {
                    cleaned.push(' ');
                    suppress_space = true;
                }
            }

            // A tab is kept and swallows any spaces right after it.
            '\t' => {
                cleaned.push('\t');
                suppress_space = true;
            }

            // Line breaks are kept and start a fresh run.
            '\n' | '\r' => {
                cleaned.push(ch);
                suppress_space = false;
            }

            // Remaining control characters (ETX included) vanish without
            // affecting space tracking.
            c if c.is_ascii_control() => {}

            // Ordinary characters pass through.
            other => {
                cleaned.push(other);
                suppress_space = false;
            }
        }
    }

    cleaned
}
1170
1171#[cfg(test)]
1172mod tests {
1173    use super::*;
1174
1175    #[test]
1176    fn test_matrix_multiplication() {
1177        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1178        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1179
1180        let result = multiply_matrix(&identity, &translation);
1181        assert_eq!(result, translation);
1182
1183        let result2 = multiply_matrix(&translation, &identity);
1184        assert_eq!(result2, translation);
1185    }
1186
1187    #[test]
1188    fn test_transform_point() {
1189        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1190        let (x, y) = transform_point(5.0, 5.0, &translation);
1191        assert_eq!(x, 15.0);
1192        assert_eq!(y, 25.0);
1193    }
1194
1195    #[test]
1196    fn test_extraction_options_default() {
1197        let options = ExtractionOptions::default();
1198        assert!(!options.preserve_layout);
1199        assert_eq!(options.space_threshold, 0.3);
1200        assert_eq!(options.newline_threshold, 10.0);
1201        assert!(options.sort_by_position);
1202        assert!(!options.detect_columns);
1203        assert_eq!(options.column_threshold, 50.0);
1204        assert!(options.merge_hyphenated);
1205    }
1206
1207    #[test]
1208    fn test_extraction_options_custom() {
1209        let options = ExtractionOptions {
1210            preserve_layout: true,
1211            space_threshold: 0.5,
1212            newline_threshold: 15.0,
1213            sort_by_position: false,
1214            detect_columns: true,
1215            column_threshold: 75.0,
1216            merge_hyphenated: false,
1217            track_space_decisions: false,
1218        };
1219        assert!(options.preserve_layout);
1220        assert_eq!(options.space_threshold, 0.5);
1221        assert_eq!(options.newline_threshold, 15.0);
1222        assert!(!options.sort_by_position);
1223        assert!(options.detect_columns);
1224        assert_eq!(options.column_threshold, 75.0);
1225        assert!(!options.merge_hyphenated);
1226    }
1227
1228    #[test]
1229    fn test_parse_font_style_bold() {
1230        // PostScript style
1231        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1232        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1233
1234        // TrueType style
1235        assert_eq!(parse_font_style("Arial Bold"), (true, false));
1236        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1237
1238        // Short form
1239        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1240    }
1241
1242    #[test]
1243    fn test_parse_font_style_italic() {
1244        // PostScript style
1245        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1246        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1247
1248        // TrueType style
1249        assert_eq!(parse_font_style("Arial Italic"), (false, true));
1250        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1251
1252        // Short form
1253        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1254    }
1255
1256    #[test]
1257    fn test_parse_font_style_bold_italic() {
1258        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1259        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1260        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1261    }
1262
1263    #[test]
1264    fn test_parse_font_style_regular() {
1265        assert_eq!(parse_font_style("Helvetica"), (false, false));
1266        assert_eq!(parse_font_style("Times-Roman"), (false, false));
1267        assert_eq!(parse_font_style("Courier"), (false, false));
1268        assert_eq!(parse_font_style("Arial"), (false, false));
1269    }
1270
1271    #[test]
1272    fn test_parse_font_style_edge_cases() {
1273        // Empty and unusual cases
1274        assert_eq!(parse_font_style(""), (false, false));
1275        assert_eq!(parse_font_style("UnknownFont"), (false, false));
1276
1277        // Case insensitive
1278        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1279        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1280    }
1281
1282    #[test]
1283    fn test_text_fragment() {
1284        let fragment = TextFragment {
1285            text: "Hello".to_string(),
1286            x: 100.0,
1287            y: 200.0,
1288            width: 50.0,
1289            height: 12.0,
1290            font_size: 10.0,
1291            font_name: None,
1292            is_bold: false,
1293            is_italic: false,
1294            color: None,
1295            space_decisions: Vec::new(),
1296        };
1297        assert_eq!(fragment.text, "Hello");
1298        assert_eq!(fragment.x, 100.0);
1299        assert_eq!(fragment.y, 200.0);
1300        assert_eq!(fragment.width, 50.0);
1301        assert_eq!(fragment.height, 12.0);
1302        assert_eq!(fragment.font_size, 10.0);
1303    }
1304
1305    #[test]
1306    fn test_extracted_text() {
1307        let fragments = vec![
1308            TextFragment {
1309                text: "Hello".to_string(),
1310                x: 100.0,
1311                y: 200.0,
1312                width: 50.0,
1313                height: 12.0,
1314                font_size: 10.0,
1315                font_name: None,
1316                is_bold: false,
1317                is_italic: false,
1318                color: None,
1319                space_decisions: Vec::new(),
1320            },
1321            TextFragment {
1322                text: "World".to_string(),
1323                x: 160.0,
1324                y: 200.0,
1325                width: 50.0,
1326                height: 12.0,
1327                font_size: 10.0,
1328                font_name: None,
1329                is_bold: false,
1330                is_italic: false,
1331                color: None,
1332                space_decisions: Vec::new(),
1333            },
1334        ];
1335
1336        let extracted = ExtractedText {
1337            text: "Hello World".to_string(),
1338            fragments: fragments,
1339        };
1340
1341        assert_eq!(extracted.text, "Hello World");
1342        assert_eq!(extracted.fragments.len(), 2);
1343        assert_eq!(extracted.fragments[0].text, "Hello");
1344        assert_eq!(extracted.fragments[1].text, "World");
1345    }
1346
1347    #[test]
1348    fn test_text_state_default() {
1349        let state = TextState::default();
1350        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1351        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1352        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1353        assert_eq!(state.leading, 0.0);
1354        assert_eq!(state.char_space, 0.0);
1355        assert_eq!(state.word_space, 0.0);
1356        assert_eq!(state.horizontal_scale, 100.0);
1357        assert_eq!(state.text_rise, 0.0);
1358        assert_eq!(state.font_size, 0.0);
1359        assert!(state.font_name.is_none());
1360        assert_eq!(state.render_mode, 0);
1361    }
1362
1363    #[test]
1364    fn test_matrix_operations() {
1365        // Test rotation matrix
1366        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
1367        let (x, y) = transform_point(1.0, 0.0, &rotation);
1368        assert_eq!(x, 0.0);
1369        assert_eq!(y, 1.0);
1370
1371        // Test scaling matrix
1372        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1373        let (x, y) = transform_point(5.0, 5.0, &scale);
1374        assert_eq!(x, 10.0);
1375        assert_eq!(y, 15.0);
1376
1377        // Test complex transformation
1378        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1379        let (x, y) = transform_point(1.0, 1.0, &complex);
1380        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
1381        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
1382    }
1383
1384    #[test]
1385    fn test_text_extractor_new() {
1386        let extractor = TextExtractor::new();
1387        let options = extractor.options;
1388        assert!(!options.preserve_layout);
1389        assert_eq!(options.space_threshold, 0.3);
1390        assert_eq!(options.newline_threshold, 10.0);
1391        assert!(options.sort_by_position);
1392        assert!(!options.detect_columns);
1393        assert_eq!(options.column_threshold, 50.0);
1394        assert!(options.merge_hyphenated);
1395    }
1396
1397    #[test]
1398    fn test_text_extractor_with_options() {
1399        let options = ExtractionOptions {
1400            preserve_layout: true,
1401            space_threshold: 0.3,
1402            newline_threshold: 12.0,
1403            sort_by_position: false,
1404            detect_columns: true,
1405            column_threshold: 60.0,
1406            merge_hyphenated: false,
1407            track_space_decisions: false,
1408        };
1409        let extractor = TextExtractor::with_options(options.clone());
1410        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1411        assert_eq!(extractor.options.space_threshold, options.space_threshold);
1412        assert_eq!(
1413            extractor.options.newline_threshold,
1414            options.newline_threshold
1415        );
1416        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1417        assert_eq!(extractor.options.detect_columns, options.detect_columns);
1418        assert_eq!(extractor.options.column_threshold, options.column_threshold);
1419        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1420    }
1421
1422    // =========================================================================
1423    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1424    // =========================================================================
1425
1426    #[test]
1427    fn test_calculate_text_width_with_no_font_info() {
1428        // Test fallback: should use simplified calculation
1429        let width = calculate_text_width("Hello", 12.0, None);
1430
1431        // Expected: 5 chars * 12.0 * 0.5 = 30.0
1432        assert_eq!(
1433            width, 30.0,
1434            "Without font info, should use simplified calculation: len * font_size * 0.5"
1435        );
1436    }
1437
1438    #[test]
1439    fn test_calculate_text_width_with_empty_metrics() {
1440        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1441
1442        // Font with no widths array
1443        let font_info = FontInfo {
1444            name: "TestFont".to_string(),
1445            font_type: "Type1".to_string(),
1446            encoding: None,
1447            to_unicode: None,
1448            differences: None,
1449            descendant_font: None,
1450            cid_to_gid_map: None,
1451            cid_ordering: None,
1452            metrics: FontMetrics {
1453                first_char: None,
1454                last_char: None,
1455                widths: None,
1456                missing_width: Some(500.0),
1457                kerning: None,
1458            },
1459        };
1460
1461        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1462
1463        // Should fall back to simplified calculation
1464        assert_eq!(
1465            width, 30.0,
1466            "Without widths array, should fall back to simplified calculation"
1467        );
1468    }
1469
1470    #[test]
1471    fn test_calculate_text_width_with_complete_metrics() {
1472        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1473
1474        // Font with complete metrics for ASCII range 32-126
1475        // Simulate typical Helvetica widths (in 1/1000 units)
1476        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
1477
1478        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
1479        widths[72 - 32] = 722.0; // 'H' is ASCII 72
1480        widths[101 - 32] = 556.0; // 'e' is ASCII 101
1481        widths[108 - 32] = 278.0; // 'l' is ASCII 108
1482        widths[111 - 32] = 611.0; // 'o' is ASCII 111
1483
1484        let font_info = FontInfo {
1485            name: "Helvetica".to_string(),
1486            font_type: "Type1".to_string(),
1487            encoding: None,
1488            to_unicode: None,
1489            differences: None,
1490            descendant_font: None,
1491            cid_to_gid_map: None,
1492            cid_ordering: None,
1493            metrics: FontMetrics {
1494                first_char: Some(32),
1495                last_char: Some(126),
1496                widths: Some(widths),
1497                missing_width: Some(500.0),
1498                kerning: None,
1499            },
1500        };
1501
1502        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1503
1504        // Expected calculation (widths in glyph space / 1000 * font_size):
1505        // H: 722/1000 * 12 = 8.664
1506        // e: 556/1000 * 12 = 6.672
1507        // l: 278/1000 * 12 = 3.336
1508        // l: 278/1000 * 12 = 3.336
1509        // o: 611/1000 * 12 = 7.332
1510        // Total: 29.34
1511        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1512        let tolerance = 0.0001; // Floating point tolerance
1513        assert!(
1514            (width - expected).abs() < tolerance,
1515            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1516            expected,
1517            width,
1518            (width - expected).abs()
1519        );
1520
1521        // Verify it's different from simplified calculation
1522        let simplified = 5.0 * 12.0 * 0.5; // 30.0
1523        assert_ne!(
1524            width, simplified,
1525            "Metrics-based calculation should differ from simplified (30.0)"
1526        );
1527    }
1528
/// Characters whose code lies outside `first_char..=last_char` must be
/// measured with the font's `missing_width` instead of the widths array.
#[test]
fn test_calculate_text_width_character_outside_range() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Font with a narrow range (only covers 'A'-'Z'); all uppercase letters
    // share the same width.
    let widths = vec![722.0; 26];

    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65), // 'A'
            last_char: Some(90),  // 'Z'
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    // "A1" mixes an in-range character with an out-of-range one.
    let width = calculate_text_width("A1", 10.0, Some(&font_info));

    // Expected:
    // 'A' (65) is in range: 722/1000 * 10 = 7.22
    // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
    // Total: 12.22
    let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);

    // Compare with a tolerance: the result is a sum of scaled floats, so exact
    // equality would tie the test to one particular order of operations.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Should use missing_width for characters outside range: expected {}, got {}, diff {}",
        expected,
        width,
        (width - expected).abs()
    );
}
1567
// An in-range character whose widths-array entry is 0.0 must be measured as
// zero-width; `missing_width` is not a substitute for an explicit 0.0 entry.
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // 95 entries of 500.0, except index 10, which is explicitly 0.0.
    let glyph_widths = (0..95)
        .map(|index| if index == 10 { 0.0 } else { 500.0 })
        .collect::<Vec<_>>();

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // Code 42 ('*') sits at index 10 relative to first_char 32.
    let text = char::from(42u8).to_string();
    let measured = calculate_text_width(&text, 10.0, Some(&font_info));

    // 0.0 is a legitimate width (zero-width character), so no fallback applies.
    assert_eq!(
        measured, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
1606
// The empty string measures 0.0 both with font metrics and without them.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // With font information available.
    let with_font = calculate_text_width("", 12.0, Some(&font_info));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Without any font information.
    let without_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        without_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1639
// A non-ASCII character beyond `last_char` is measured with `missing_width`.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Metrics only describe codes 32..=126.
    let ascii_metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: ascii_metrics,
    };

    // 'Ñ' is U+00D1 (code 209), which lies outside the described range.
    let measured = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // missing_width applies: 600/1000 * 10 = 6.0
    assert_eq!(
        measured, 6.0,
        "Unicode character outside range should use missing_width"
    );
}
1673
// The measured width of one glyph scales linearly with the font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Single-glyph font: only 'A' (65) is described, with width 722.
    let single_glyph = FontMetrics {
        first_char: Some(65),
        last_char: Some(65),
        widths: Some(vec![722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: single_glyph,
    };

    // Measure the same character at two sizes.
    let measure_a = |font_size| calculate_text_width("A", font_size, Some(&font_info));
    let width_10 = measure_a(10.0);
    let width_20 = measure_a(20.0);

    // Each size matches glyph_width / 1000 * font_size ...
    assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
    assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
    // ... and doubling the size doubles the width.
    assert_eq!(
        width_20,
        width_10 * 2.0,
        "Width should scale linearly with font size"
    );
}
1709
// The same character measures narrower in a font with a small per-glyph
// width than in a font where every glyph shares one larger width.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fixtures describe codes 105..=107 ('i', 'j', 'k') and differ only
    // in name, widths array and missing_width.
    let build_font = |name: &str, widths, missing_width| FontInfo {
        name: name.to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(105), // 'i'
            last_char: Some(107),  // 'k'
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        },
    };

    // Proportional simulation: three different widths, narrowest first.
    let proportional_font = build_font("Helvetica", vec![278.0, 556.0, 722.0], 500.0);
    // Monospace simulation: every glyph is 600 wide.
    let monospace_font = build_font("Courier", vec![600.0, 600.0, 600.0], 600.0);

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    // 278 < 600, so the proportional 'i' has to come out narrower.
    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1765
1766    // =========================================================================
1767    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1768    // =========================================================================
1769
// Kerning pairs listed in the font metrics adjust the measured width of the
// matching character pair; pairs that are not listed leave it untouched.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Widths for ASCII 32-126: 500 everywhere except the three glyphs under test.
    let mut widths = vec![500.0; 95];
    for &(code, glyph_width) in &[(65usize, 722.0), (86, 722.0), (87, 944.0)] {
        // 'A' = 65, 'V' = 86, 'W' = 87; the array starts at first_char 32.
        widths[code - 32] = glyph_width;
    }

    // Kerning adjustments in 1/1000 units: A+V tightens by 50, A+W by 40.
    let kerning: HashMap<_, _> = vec![((65, 86), -50.0), ((65, 87), -40.0)]
        .into_iter()
        .collect();

    let font_info = FontInfo {
        name: "Helvetica".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: Some(kerning),
        },
    };

    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));

    // AV: (722 + 722)/1000 * 12 + (-50/1000 * 12) = 17.328 - 0.6  = 16.728
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    // AW: (722 + 944)/1000 * 12 + (-40/1000 * 12) = 19.992 - 0.48 = 19.512
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    // VA: (722 + 722)/1000 * 12 = 17.328 (pair is not in the kerning map)
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;

    let tolerance = 0.0001;
    let cases = [
        ("AV with kerning", width_av, expected_av),
        ("AW with kerning", width_aw, expected_aw),
        ("VA without kerning", width_va, expected_va),
    ];
    for &(label, actual, expected) in cases.iter() {
        let diff = (actual - expected).abs();
        assert!(
            diff < tolerance,
            "{}: expected {}, got {}, diff {}",
            label,
            expected,
            actual,
            diff
        );
    }

    // The kerned pair must come out measurably narrower than the reversed,
    // un-kerned pair built from the same two glyphs.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1851
/// Parses a hand-built TrueType font containing a two-pair format 0 `kern`
/// subtable and checks that both pairs are returned with their values.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
    // Structure:
    // 1. Offset table (12 bytes)
    // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
    // 3. 'head' table data (54 bytes)
    // 4. 'kern' table data (30 bytes)
    // Total: 128 bytes
    let mut ttf_data = vec![
        // Offset table
        0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
        0x00, 0x02, // numTables: 2
        0x00, 0x20, // searchRange: 32
        0x00, 0x01, // entrySelector: 1
        0x00, 0x00, // rangeShift: 0
    ];

    // Table directory entry 1: 'head' table
    ttf_data.extend_from_slice(b"head"); // tag
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54

    // Table directory entry 2: 'kern' table
    ttf_data.extend_from_slice(b"kern"); // tag
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)

    // 'head' table data (54 bytes of zeros - minimal valid head table)
    ttf_data.extend_from_slice(&[0u8; 54]);

    // 'kern' table data (30 bytes: table header 4 + subtable header 6
    // + search fields 8 + 2 pairs * 6)
    ttf_data.extend_from_slice(&[
        // Kern table header
        0x00, 0x00, // version: 0
        0x00, 0x01, // nTables: 1
        // Subtable header
        0x00, 0x00, // version: 0
        0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
        // NOTE(review): OpenType places the subtable format in bits 8-15 and the
        // horizontal flag in bit 0, so 0x0000 reads as format 0 with the
        // horizontal bit clear - confirm which bits the parser inspects.
        0x00, 0x00, // coverage: 0x0000
        0x00, 0x02, // nPairs: 2
        // NOTE(review): for nPairs = 2 the format 0 definition yields
        // searchRange 12, entrySelector 1, rangeShift 0. The bytes below are
        // kept as-is, presumably because the parser ignores them - confirm.
        0x00, 0x08, // searchRange: 8
        0x00, 0x00, // entrySelector: 0
        0x00, 0x04, // rangeShift: 4
        // Kerning pair 1: A + V → -50
        0x00, 0x41, // left glyph: 65 ('A')
        0x00, 0x56, // right glyph: 86 ('V')
        0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
        // Kerning pair 2: A + W → -40
        0x00, 0x41, // left glyph: 65 ('A')
        0x00, 0x57, // right glyph: 87 ('W')
        0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
    ]);

    // Guard the fixture itself: the directory offsets above are only valid if
    // the buffer has exactly the documented layout.
    assert_eq!(
        ttf_data.len(),
        128,
        "Fixture layout should be 12 + 32 + 54 + 30 bytes"
    );

    // `expect` reports the parser's error on failure, so no separate
    // `is_ok()` assertion followed by `unwrap()` is needed.
    let kerning_map = parse_truetype_kern_table(&ttf_data)
        .expect("Should parse minimal kern table successfully");
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    // Verify pair 1: A + V → -50
    assert_eq!(
        kerning_map.get(&(65, 86)),
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    // Verify pair 2: A + W → -40
    assert_eq!(
        kerning_map.get(&(65, 87)),
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1934
/// A font whose table directory has no 'kern' entry must yield an empty
/// kerning map rather than an error.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // TrueType font data WITHOUT a 'kern' table
    // Structure:
    // - Offset table (12 bytes)
    // - Table directory: 1 entry for 'head' table (16 bytes, not 'kern')
    // - Mock 'head' table data (54 bytes)
    // Total: 82 bytes
    let mut ttf_data = vec![
        // Offset table
        0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
        0x00, 0x01, // numTables: 1
        0x00, 0x10, // searchRange: 16
        0x00, 0x00, // entrySelector: 0
        0x00, 0x00, // rangeShift: 0
        // Table directory entry: 'head' table (not 'kern')
        b'h', b'e', b'a', b'd', // tag: 'head'
        0x00, 0x00, 0x00, 0x00, // checksum
        0x00, 0x00, 0x00, 0x1C, // offset: 28 (12 + 16)
        0x00, 0x00, 0x00, 0x36, // length: 54
    ];

    // Mock 'head' table data (54 bytes of zeros)
    ttf_data.extend_from_slice(&[0u8; 54]);

    // Guard the fixture itself: the 'head' entry claims offset 28 + length 54.
    assert_eq!(
        ttf_data.len(),
        82,
        "Fixture layout should be 12 + 16 + 54 bytes"
    );

    // `expect` surfaces the underlying error if extraction fails, which the
    // bare `is_ok()` assertion did not.
    let kerning_map = extract_truetype_kerning(&ttf_data)
        .expect("Should gracefully handle missing kern table");

    assert!(
        kerning_map.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1974}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs