oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Text extraction options
///
/// Controls how [`TextExtractor`] turns the text-showing operators of a page
/// into a string and (optionally) positioned fragments. See the `Default`
/// implementation for the default values.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning).
    ///
    /// When `true`, positioned [`TextFragment`]s are collected and the final
    /// text is rebuilt from them; when `false`, no fragments are produced.
    pub preserve_layout: bool,
    /// Horizontal gap above which a space character is inserted.
    ///
    /// The extractor multiplies this value by the current font size before
    /// comparing it with the gap, so it acts as a fraction of the font size.
    pub space_threshold: f64,
    /// Minimum vertical distance between two pieces of text for a newline to be
    /// inserted. Also used as the band height when fragments are sorted into lines.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts).
    ///
    /// Only has an effect when fragments exist, i.e. together with `preserve_layout`.
    pub sort_by_position: bool,
    /// Detect and handle columns (applied as part of position sorting).
    pub detect_columns: bool,
    /// Column separation threshold: minimum horizontal gap between two fragments
    /// on the same line for the gap to count as a column boundary.
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends (the trailing `-` and the line break
    /// are dropped when text is rebuilt from fragments).
    pub merge_hyphenated: bool,
    /// Track space insertion decisions in each TextFragment (default: false).
    /// When false: zero overhead. When true: populates `TextFragment::space_decisions`.
    // NOTE(review): the page-extraction code visible in this part of the file never
    // reads this flag — confirm where `space_decisions` is actually populated.
    pub track_space_decisions: bool,
}
37
38impl Default for ExtractionOptions {
39    fn default() -> Self {
40        Self {
41            preserve_layout: false,
42            space_threshold: 0.3,
43            newline_threshold: 10.0,
44            sort_by_position: true,
45            detect_columns: false,
46            column_threshold: 50.0,
47            merge_hyphenated: true,
48            track_space_decisions: false,
49        }
50    }
51}
52
/// Extracted text with position information
///
/// Result of extracting a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text content.
    ///
    /// When layout preservation is enabled and fragments were collected, this is
    /// rebuilt from the (sorted, merged) fragments.
    pub text: String,
    /// Text fragments with position information (if preserve_layout is true);
    /// empty otherwise.
    pub fragments: Vec<TextFragment>,
}
61
/// Metadata about a space insertion decision during text extraction.
/// Only populated when [`ExtractionOptions::track_space_decisions`] is `true`.
// NOTE(review): nothing in the visible part of this file constructs a
// `SpaceDecision`; confirm which code path records them.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Character offset in the extracted text.
    pub offset: usize,
    /// Actual horizontal gap (dx) in text space units.
    pub dx: f64,
    /// The threshold used at this point.
    pub threshold: f64,
    /// Confidence: `|dx - threshold| / threshold`, clamped to [0.0, 1.0].
    pub confidence: f64,
    /// Whether a space was inserted.
    pub inserted: bool,
}
77
/// A fragment of text with position information
///
/// One fragment is produced per text-showing operation when layout
/// preservation is enabled; nearby fragments on the same line may later be
/// merged into a single fragment.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Text content (already decoded to Unicode).
    pub text: String,
    /// X position in page coordinates (text origin after applying the text
    /// matrix and the current transformation matrix).
    pub x: f64,
    /// Y position in page coordinates (same transformation as `x`).
    pub y: f64,
    /// Width of the text
    pub width: f64,
    /// Height of the text (the extractor sets this to the font size).
    pub height: f64,
    /// Font size
    pub font_size: f64,
    /// Font name (if known) - used for kerning-aware text spacing
    pub font_name: Option<String>,
    /// Whether the font is bold (detected from font name)
    pub is_bold: bool,
    /// Whether the font is italic (detected from font name)
    pub is_italic: bool,
    /// Fill color of the text (from graphics state); `None` if no fill color
    /// operator was seen before the text was shown.
    pub color: Option<Color>,
    /// Space insertion decisions (empty unless `track_space_decisions` is true).
    pub space_decisions: Vec<SpaceDecision>,
}
104
/// Text extraction state
///
/// Mutable interpreter state tracked while walking a page's content
/// operations. Matrices are stored as the six PDF matrix components
/// `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix (advanced after each shown string)
    text_matrix: [f64; 6],
    /// Current text line matrix (start of the current line; updated by the
    /// text-positioning operators)
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM)
    ctm: [f64; 6],
    /// Text leading (line spacing), used when moving to the next line
    leading: f64,
    /// Character spacing
    // NOTE(review): recorded but not read by the page-extraction loop in view.
    char_space: f64,
    /// Word spacing
    // NOTE(review): recorded but not read by the page-extraction loop in view.
    word_space: f64,
    /// Horizontal scaling, as a percentage (100.0 = unscaled)
    horizontal_scale: f64,
    /// Text rise
    // NOTE(review): recorded but not read by the page-extraction loop in view.
    text_rise: f64,
    /// Current font size
    font_size: f64,
    /// Current font name (the name given to the set-font operator; used as the
    /// key into the per-page font cache)
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    // NOTE(review): recorded but not read by the page-extraction loop in view.
    render_mode: u8,
    /// Fill color (for text rendering)
    fill_color: Option<Color>,
}
132
133impl Default for TextState {
134    fn default() -> Self {
135        Self {
136            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139            leading: 0.0,
140            char_space: 0.0,
141            word_space: 0.0,
142            horizontal_scale: 100.0,
143            text_rise: 0.0,
144            font_size: 0.0,
145            font_name: None,
146            render_mode: 0,
147            fill_color: None,
148        }
149    }
150}
151
/// Parse font style (bold/italic) from font name
///
/// Heuristically detects bold and italic styles from common font naming
/// patterns. The match is case-insensitive and works with PostScript font
/// names (e.g., "Helvetica-Bold", "Times-BoldItalic") and TrueType names
/// (e.g., "Arial Bold", "Courier Oblique").
///
/// A name is considered bold if it contains `bold`, `-b` or ` b `, or ends
/// with ` b`; italic if it contains `italic`, `oblique`, `-i` or ` i `, or
/// ends with ` i`.
///
/// Note that `-b` / `-i` match any hyphenated suffix *starting* with that
/// letter, so names such as "Frutiger-Book" are reported as bold.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Substrings that may appear anywhere in the (lower-cased) name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    // A style is present if any marker occurs in the name, or the name ends
    // with the single-letter abbreviation preceded by a space.
    let has_style = |markers: &[&str], trailing: &str| {
        markers.iter().any(|marker| lowered.contains(*marker)) || lowered.ends_with(trailing)
    };

    (
        has_style(&BOLD_MARKERS, " b"),
        has_style(&ITALIC_MARKERS, " i"),
    )
}
190
/// Text extractor for PDF pages with CMap support
///
/// Holds the extraction options plus two font caches so that font objects
/// shared between pages are parsed only once per extractor instance.
pub struct TextExtractor {
    /// Options controlling spacing, layout preservation and sorting.
    options: ExtractionOptions,
    /// Font cache for the current page (name-keyed, rebuilt per page since names are page-local)
    font_cache: HashMap<String, FontInfo>,
    /// Persistent font cache keyed by PDF object reference — avoids re-parsing the same font
    /// object across pages. Most multi-page PDFs reuse the same font objects.
    /// The key is the reference pair passed to `PdfDocument::get_object`.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202    /// Create a new text extractor with default options
203    pub fn new() -> Self {
204        Self {
205            options: ExtractionOptions::default(),
206            font_cache: HashMap::new(),
207            font_object_cache: HashMap::new(),
208        }
209    }
210
211    /// Create a text extractor with custom options
212    pub fn with_options(options: ExtractionOptions) -> Self {
213        Self {
214            options,
215            font_cache: HashMap::new(),
216            font_object_cache: HashMap::new(),
217        }
218    }
219
220    /// Extract text from a PDF document
221    pub fn extract_from_document<R: Read + Seek>(
222        &mut self,
223        document: &PdfDocument<R>,
224    ) -> ParseResult<Vec<ExtractedText>> {
225        let page_count = document.page_count()?;
226        let mut results = Vec::new();
227
228        for i in 0..page_count {
229            let text = self.extract_from_page(document, i)?;
230            results.push(text);
231        }
232
233        Ok(results)
234    }
235
    /// Extract text from a specific page
    ///
    /// Loads the page's font resources, parses each of its content streams and
    /// interprets the text-related operators to build the page text. When
    /// `preserve_layout` is enabled, positioned [`TextFragment`]s are collected
    /// as well, optionally sorted/merged, and the final text is rebuilt from them.
    ///
    /// `page_index` is zero-based (it is shown as `page_index + 1` in log output).
    ///
    /// # Errors
    ///
    /// Propagates errors from fetching the page, loading its font resources,
    /// obtaining its content streams, and decoding text. A content stream that
    /// fails to *parse* is not an error: it is logged at debug level and skipped.
    pub fn extract_from_page<R: Read + Seek>(
        &mut self,
        document: &PdfDocument<R>,
        page_index: u32,
    ) -> ParseResult<ExtractedText> {
        // Get the page
        let page = document.get_page(page_index)?;

        // Extract font resources first, so that text decoding and width
        // calculation below can look fonts up by name.
        {
            let _span = tracing::info_span!("font_resources").entered();
            self.extract_font_resources(&page, document)?;
        }

        // Get content streams
        let streams = {
            let _span = tracing::info_span!("stream_decompress").entered();
            page.content_streams_with_document(document)?
        };

        // Accumulated output and interpreter state. The state is shared by all
        // content streams of the page (it is not reset between streams).
        let mut extracted_text = String::new();
        let mut fragments = Vec::new();
        let mut state = TextState::default();
        let mut in_text_object = false;
        // End position of the most recently shown string (`ShowText` only),
        // used to decide whether a space or newline separates the next one.
        let mut last_x = 0.0;
        let mut last_y = 0.0;

        // Process each content stream
        for (stream_idx, stream_data) in streams.iter().enumerate() {
            let operations = match {
                let _span = tracing::info_span!("content_parse").entered();
                ContentParser::parse_content(stream_data)
            } {
                Ok(ops) => ops,
                Err(e) => {
                    // Enhanced diagnostic logging for content stream parsing failures
                    tracing::debug!(
                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
                        page_index + 1,
                        stream_idx + 1,
                        streams.len()
                    );
                    tracing::debug!("         Error: {}", e);
                    tracing::debug!("         Stream size: {} bytes", stream_data.len());

                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
                    let preview_len = stream_data.len().min(100);
                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
                    tracing::debug!(
                        "         Stream preview (first {} bytes): {:?}",
                        preview_len,
                        preview.chars().take(80).collect::<String>()
                    );

                    // Continue processing other streams (best-effort extraction)
                    continue;
                }
            };

            let _ops_span = tracing::info_span!("text_ops_loop").entered();
            for op in operations {
                match op {
                    ContentOperation::BeginText => {
                        in_text_object = true;
                        // Reset text matrix to identity
                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
                    }

                    ContentOperation::EndText => {
                        in_text_object = false;
                    }

                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
                        // Replaces (does not concatenate with) both the text
                        // matrix and the text line matrix.
                        state.text_matrix =
                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
                        state.text_line_matrix =
                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
                    }

                    ContentOperation::MoveText(tx, ty) => {
                        // Update text matrix by translation, relative to the
                        // start of the current line (text line matrix).
                        let new_matrix = multiply_matrix(
                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
                            &state.text_line_matrix,
                        );
                        state.text_matrix = new_matrix;
                        state.text_line_matrix = new_matrix;
                    }

                    ContentOperation::NextLine => {
                        // Move to next line using current leading
                        let new_matrix = multiply_matrix(
                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
                            &state.text_line_matrix,
                        );
                        state.text_matrix = new_matrix;
                        state.text_line_matrix = new_matrix;
                    }

                    ContentOperation::ShowText(text) => {
                        // Text shown outside a BT/ET text object is ignored.
                        if in_text_object {
                            let text_bytes = &text;
                            let decoded = self.decode_text(text_bytes, &state)?;

                            // Calculate position: Apply text_matrix, then CTM
                            // Concatenate: final = CTM × text_matrix
                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);

                            // Add spacing based on position change relative to
                            // the end of the previously shown string. A newline
                            // takes precedence over a space.
                            if !extracted_text.is_empty() {
                                let dx = x - last_x;
                                let dy = (y - last_y).abs();

                                if dy > self.options.newline_threshold {
                                    extracted_text.push('\n');
                                } else if dx > self.options.space_threshold * state.font_size {
                                    // The space threshold scales with the font size.
                                    extracted_text.push(' ');
                                }
                            }

                            extracted_text.push_str(&decoded);

                            // Get font info for accurate width calculation
                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));

                            // Calculate width once and reuse
                            let text_width =
                                calculate_text_width(&decoded, state.font_size, font_info);

                            if self.options.preserve_layout {
                                // Detect bold/italic from font name
                                let (is_bold, is_italic) = state
                                    .font_name
                                    .as_ref()
                                    .map(|name| parse_font_style(name))
                                    .unwrap_or((false, false));

                                fragments.push(TextFragment {
                                    text: decoded.clone(),
                                    x,
                                    y,
                                    width: text_width,
                                    // The font size is used as the fragment height.
                                    height: state.font_size,
                                    font_size: state.font_size,
                                    font_name: state.font_name.clone(),
                                    is_bold,
                                    is_italic,
                                    color: state.fill_color,
                                    // No space decisions are recorded on this path.
                                    space_decisions: Vec::new(),
                                });
                            }

                            // Update position for next text
                            last_x = x + text_width;
                            last_y = y;

                            // Update text matrix for next show operation:
                            // advance by the text width, scaled by the
                            // horizontal scaling percentage.
                            let tx = text_width * state.horizontal_scale / 100.0;
                            state.text_matrix =
                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
                        }
                    }

                    ContentOperation::ShowTextArray(array) => {
                        // NOTE(review): unlike `ShowText`, this branch neither
                        // inserts space/newline separators, nor updates
                        // `last_x`/`last_y`, nor records fragments. With
                        // `preserve_layout` the final text is rebuilt from
                        // fragments only, so text shown through arrays would be
                        // missing from it — confirm whether this is intended.
                        if in_text_object {
                            // Get font info for accurate width calculation
                            let font_info = state
                                .font_name
                                .as_ref()
                                .and_then(|name| self.font_cache.get(name));

                            for item in array {
                                match item {
                                    TextElement::Text(text_bytes) => {
                                        let decoded = self.decode_text(&text_bytes, &state)?;
                                        extracted_text.push_str(&decoded);

                                        // Update text matrix: advance by the
                                        // width of the string just shown.
                                        let text_width = calculate_text_width(
                                            &decoded,
                                            state.font_size,
                                            font_info,
                                        );
                                        let tx = text_width * state.horizontal_scale / 100.0;
                                        state.text_matrix = multiply_matrix(
                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
                                            &state.text_matrix,
                                        );
                                    }
                                    TextElement::Spacing(adjustment) => {
                                        // Text position adjustment (negative = move left).
                                        // The adjustment is in thousandths of the font
                                        // size and is subtracted from the position.
                                        // NOTE(review): horizontal scaling is not applied
                                        // here, unlike the string advance above.
                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
                                        state.text_matrix = multiply_matrix(
                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
                                            &state.text_matrix,
                                        );
                                    }
                                }
                            }
                        }
                    }

                    // Text-state operators: simply record the new value.
                    ContentOperation::SetFont(name, size) => {
                        state.font_name = Some(name);
                        state.font_size = size as f64;
                    }

                    ContentOperation::SetLeading(leading) => {
                        state.leading = leading as f64;
                    }

                    ContentOperation::SetCharSpacing(spacing) => {
                        state.char_space = spacing as f64;
                    }

                    ContentOperation::SetWordSpacing(spacing) => {
                        state.word_space = spacing as f64;
                    }

                    ContentOperation::SetHorizontalScaling(scale) => {
                        state.horizontal_scale = scale as f64;
                    }

                    ContentOperation::SetTextRise(rise) => {
                        state.text_rise = rise as f64;
                    }

                    ContentOperation::SetTextRenderMode(mode) => {
                        state.render_mode = mode as u8;
                    }

                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
                        // Update CTM: new_ctm = concat_matrix * current_ctm
                        // (the operand matrix is applied before the existing CTM).
                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
                        let a = a as f64;
                        let b = b as f64;
                        let c = c as f64;
                        let d = d as f64;
                        let e = e as f64;
                        let f = f as f64;
                        state.ctm = [
                            a * a0 + b * c0,
                            a * b0 + b * d0,
                            c * a0 + d * c0,
                            c * b0 + d * d0,
                            e * a0 + f * c0 + e0,
                            e * b0 + f * d0 + f0,
                        ];
                    }

                    // Color operations (Phase 4: Color extraction).
                    // Only the non-stroking (fill) color is tracked.
                    ContentOperation::SetNonStrokingGray(gray) => {
                        state.fill_color = Some(Color::gray(gray as f64));
                    }

                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
                    }

                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
                        state.fill_color =
                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
                    }

                    _ => {
                        // Other operations don't affect text extraction
                    }
                }
            }
        }

        {
            let _span = tracing::info_span!("layout_finalize").entered();

            // Sort and process fragments if requested.
            // Fragments only exist when `preserve_layout` is enabled.
            if self.options.sort_by_position && !fragments.is_empty() {
                self.sort_and_merge_fragments(&mut fragments);
            }

            // Merge close fragments to eliminate spacing artifacts
            // This is crucial for table detection and structured data extraction
            if self.options.preserve_layout && !fragments.is_empty() {
                fragments = self.merge_close_fragments(&fragments);
            }

            // Reconstruct text from sorted fragments if layout is preserved;
            // this replaces the text accumulated during the operator loop.
            if self.options.preserve_layout && !fragments.is_empty() {
                extracted_text = self.reconstruct_text_from_fragments(&fragments);
            }
        }

        Ok(ExtractedText {
            text: extracted_text,
            fragments,
        })
    }
538
539    /// Sort text fragments by position and merge them appropriately
540    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
541        // Sort fragments by Y position (top to bottom) then X position (left to right).
542        //
543        // We quantize Y into bands of `newline_threshold` width so that fragments
544        // on the "same line" get identical Y keys. This ensures the comparator is
545        // a strict total order (transitive), which Rust's sort algorithm requires.
546        // Without quantization, threshold-based "same line" detection breaks
547        // transitivity: A≈B and B≈C does NOT imply A≈C.
548        let threshold = self.options.newline_threshold;
549        fragments.sort_by(|a, b| {
550            // Quantize Y to nearest band (PDF Y increases upward, so negate first)
551            let band_a = if threshold > 0.0 {
552                (-a.y / threshold).round()
553            } else {
554                -a.y
555            };
556            let band_b = if threshold > 0.0 {
557                (-b.y / threshold).round()
558            } else {
559                -b.y
560            };
561
562            // Compare by Y band (top to bottom), then by X within same band
563            band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
564        });
565
566        // Detect columns if requested
567        if self.options.detect_columns {
568            self.detect_and_sort_columns(fragments);
569        }
570    }
571
572    /// Detect columns and re-sort fragments accordingly
573    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
574        // Group fragments by approximate Y position
575        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
576        let mut current_line: Vec<&mut TextFragment> = Vec::new();
577        let mut last_y = f64::INFINITY;
578
579        for fragment in fragments.iter_mut() {
580            let fragment_y = fragment.y;
581            if (last_y - fragment_y).abs() > self.options.newline_threshold
582                && !current_line.is_empty()
583            {
584                lines.push(current_line);
585                current_line = Vec::new();
586            }
587            current_line.push(fragment);
588            last_y = fragment_y;
589        }
590        if !current_line.is_empty() {
591            lines.push(current_line);
592        }
593
594        // Detect column boundaries
595        let mut column_boundaries = vec![0.0];
596        for line in &lines {
597            if line.len() > 1 {
598                for i in 0..line.len() - 1 {
599                    let gap = line[i + 1].x - (line[i].x + line[i].width);
600                    if gap > self.options.column_threshold {
601                        let boundary = line[i].x + line[i].width + gap / 2.0;
602                        if !column_boundaries
603                            .iter()
604                            .any(|&b| (b - boundary).abs() < 10.0)
605                        {
606                            column_boundaries.push(boundary);
607                        }
608                    }
609                }
610            }
611        }
612        column_boundaries.sort_by(|a, b| a.total_cmp(b));
613
614        // Re-sort fragments by column then Y position
615        if column_boundaries.len() > 1 {
616            fragments.sort_by(|a, b| {
617                // Determine column for each fragment
618                let col_a = column_boundaries
619                    .iter()
620                    .position(|&boundary| a.x < boundary)
621                    .unwrap_or(column_boundaries.len())
622                    - 1;
623                let col_b = column_boundaries
624                    .iter()
625                    .position(|&boundary| b.x < boundary)
626                    .unwrap_or(column_boundaries.len())
627                    - 1;
628
629                if col_a != col_b {
630                    col_a.cmp(&col_b)
631                } else {
632                    // Same column, sort by Y position
633                    b.y.total_cmp(&a.y)
634                }
635            });
636        }
637    }
638
639    /// Reconstruct text from sorted fragments
640    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
641        // First, merge consecutive fragments that are very close together
642        let merged_fragments = self.merge_close_fragments(fragments);
643
644        let mut result = String::new();
645        let mut last_y = f64::INFINITY;
646        let mut last_x = 0.0;
647        let mut last_line_ended_with_hyphen = false;
648
649        for fragment in &merged_fragments {
650            // Check if we need a newline
651            let y_diff = (last_y - fragment.y).abs();
652            if !result.is_empty() && y_diff > self.options.newline_threshold {
653                // Handle hyphenation
654                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
655                    // Remove the hyphen and don't add newline
656                    if result.ends_with('-') {
657                        result.pop();
658                    }
659                } else {
660                    result.push('\n');
661                }
662            } else if !result.is_empty() {
663                // Check if we need a space
664                let x_gap = fragment.x - last_x;
665                if x_gap > self.options.space_threshold * fragment.font_size {
666                    result.push(' ');
667                }
668            }
669
670            result.push_str(&fragment.text);
671            last_line_ended_with_hyphen = fragment.text.ends_with('-');
672            last_y = fragment.y;
673            last_x = fragment.x + fragment.width;
674        }
675
676        result
677    }
678
679    /// Merge fragments that are very close together on the same line
680    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
681    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
682        if fragments.is_empty() {
683            return Vec::new();
684        }
685
686        let mut merged = Vec::new();
687        let mut current = fragments[0].clone();
688
689        for fragment in &fragments[1..] {
690            // Check if this fragment is on the same line and very close
691            let y_diff = (current.y - fragment.y).abs();
692            let x_gap = fragment.x - (current.x + current.width);
693
694            // Merge if on same line and gap is less than a character width
695            // Use 0.5 * font_size as threshold - this catches most artificial spacing
696            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
697                && x_gap >= 0.0  // Fragment is to the right
698                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
699
700            if should_merge {
701                // Merge this fragment into current
702                current.text.push_str(&fragment.text);
703                current.width = (fragment.x + fragment.width) - current.x;
704            } else {
705                // Start a new fragment
706                merged.push(current);
707                current = fragment.clone();
708            }
709        }
710
711        merged.push(current);
712        merged
713    }
714
715    /// Extract font resources from page
716    ///
717    /// Clears the per-page name cache (font names are page-local in PDF), but
718    /// reuses previously parsed font objects via `font_object_cache` to avoid
719    /// re-parsing the same font object across multiple pages.
720    fn extract_font_resources<R: Read + Seek>(
721        &mut self,
722        page: &ParsedPage,
723        document: &PdfDocument<R>,
724    ) -> ParseResult<()> {
725        // Clear per-page name mapping (font names like /F1 are page-local)
726        self.font_cache.clear();
727
728        // Try to get resources manually from page dictionary first
729        // This is necessary because ParsedPage.get_resources() may not always work
730        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
731            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
732            {
733                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
734                    for (font_name, font_obj) in font_dict.0.iter() {
735                        if let Some(font_ref) = font_obj.as_reference() {
736                            self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
737                        }
738                    }
739                }
740            }
741        } else if let Some(resources) = page.get_resources() {
742            // Fallback to get_resources() if Resources is not a reference
743            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
744                for (font_name, font_obj) in font_dict.0.iter() {
745                    if let Some(font_ref) = font_obj.as_reference() {
746                        self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
747                    }
748                }
749            }
750        }
751
752        Ok(())
753    }
754
755    /// Cache a font, reusing the persistent object cache when possible.
756    fn cache_font_by_ref<R: Read + Seek>(
757        &mut self,
758        font_name: &str,
759        font_ref: (u32, u16),
760        document: &PdfDocument<R>,
761    ) {
762        // Check persistent object cache first — avoids re-parsing across pages
763        if let Some(cached) = self.font_object_cache.get(&font_ref) {
764            self.font_cache
765                .insert(font_name.to_string(), cached.clone());
766            tracing::debug!(
767                "Reused cached font object ({}, {}): {} (ToUnicode: {})",
768                font_ref.0,
769                font_ref.1,
770                font_name,
771                cached.to_unicode.is_some()
772            );
773            return;
774        }
775
776        // Parse font object
777        if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
778            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
779            if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
780                let has_to_unicode = font_info.to_unicode.is_some();
781                // Store in persistent cache
782                self.font_object_cache.insert(font_ref, font_info.clone());
783                // Store in per-page name cache
784                self.font_cache.insert(font_name.to_string(), font_info);
785                tracing::debug!(
786                    "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
787                    font_ref.0,
788                    font_ref.1,
789                    font_name,
790                    has_to_unicode
791                );
792            }
793        }
794    }
795
796    /// Decode text using the current font encoding and ToUnicode mapping
797    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
798        use crate::text::encoding::TextEncoding;
799
800        // First, try to use cached font information with ToUnicode CMap
801        if let Some(ref font_name) = state.font_name {
802            if let Some(font_info) = self.font_cache.get(font_name) {
803                // Try CMap-based decoding first (free function — no allocation)
804                if let Ok(decoded) =
805                    crate::text::extraction_cmap::decode_text_with_font(text, font_info)
806                {
807                    // Only accept if we got meaningful text (not all null bytes or garbage)
808                    if !decoded.trim().is_empty()
809                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
810                    {
811                        // Apply sanitization to remove control characters (Issue #116)
812                        let sanitized = sanitize_extracted_text(&decoded);
813                        tracing::debug!(
814                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
815                            font_name,
816                            text,
817                            sanitized
818                        );
819                        return Ok(sanitized);
820                    }
821                }
822
823                tracing::debug!(
824                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
825                    font_name
826                );
827            }
828        }
829
830        // Fall back to encoding-based decoding
831        let encoding = if let Some(ref font_name) = state.font_name {
832            match font_name.to_lowercase().as_str() {
833                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
834                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
835                name if name.contains("standard") => TextEncoding::StandardEncoding,
836                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
837                _ => {
838                    // Default based on common patterns
839                    if font_name.starts_with("Times")
840                        || font_name.starts_with("Helvetica")
841                        || font_name.starts_with("Courier")
842                    {
843                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
844                    } else {
845                        TextEncoding::PdfDocEncoding // Safe default
846                    }
847                }
848            }
849        } else {
850            TextEncoding::WinAnsiEncoding // Default for most PDFs
851        };
852
853        let fallback_result = encoding.decode(text);
854        // Apply sanitization to remove control characters (Issue #116)
855        let sanitized = sanitize_extracted_text(&fallback_result);
856        tracing::debug!(
857            "Fallback encoding decoding: {:?} -> \"{}\"",
858            text,
859            sanitized
860        );
861        Ok(sanitized)
862    }
863}
864
865impl Default for TextExtractor {
866    fn default() -> Self {
867        Self::new()
868    }
869}
870
/// Concatenate two affine transforms stored in six-number form.
///
/// Each array `[a, b, c, d, e, f]` stands for the 3×3 matrix with rows
/// `[a b 0]`, `[c d 0]`, `[e f 1]`. The return value is the product `a × b`
/// of those matrices, again in six-number form.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear (2×2) part.
    let m00 = a0 * b0 + a1 * b2;
    let m01 = a0 * b1 + a1 * b3;
    let m10 = a2 * b0 + a3 * b2;
    let m11 = a2 * b1 + a3 * b3;

    // Translation row: `a`'s translation pushed through `b`, plus `b`'s own.
    let tx = a4 * b0 + a5 * b2 + b4;
    let ty = a4 * b1 + a5 * b3 + b5;

    [m00, m01, m10, m11, tx, ty]
}
882
/// Apply a six-number affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
889
890/// Calculate text width using actual font metrics (including kerning)
891fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
892    // If we have font metrics, use them for accurate width calculation
893    if let Some(font) = font_info {
894        if let Some(ref widths) = font.metrics.widths {
895            let first_char = font.metrics.first_char.unwrap_or(0);
896            let last_char = font.metrics.last_char.unwrap_or(255);
897            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
898
899            let mut total_width = 0.0;
900            let mut chars = text.chars().peekable();
901
902            while let Some(ch) = chars.next() {
903                let char_code = ch as u32;
904
905                // Get width from Widths array or use missing_width
906                let width = if char_code >= first_char && char_code <= last_char {
907                    let index = (char_code - first_char) as usize;
908                    widths.get(index).copied().unwrap_or(missing_width)
909                } else {
910                    missing_width
911                };
912
913                // Convert from glyph space (1/1000 units) to user space
914                total_width += width / 1000.0 * font_size;
915
916                // Apply kerning if available (for character pairs)
917                if let Some(ref kerning) = font.metrics.kerning {
918                    if let Some(&next_ch) = chars.peek() {
919                        let next_char = next_ch as u32;
920                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
921                            // Kerning is in FUnits (1/1000), convert to user space
922                            total_width += kern_value / 1000.0 * font_size;
923                        }
924                    }
925                }
926            }
927
928            return total_width;
929        }
930    }
931
932    // Fallback to simplified calculation if no metrics available
933    text.len() as f64 * font_size * 0.5
934}
935
/// Clean up control characters left behind by text extraction.
///
/// Written for Issue #116, where extracted text contained NUL (`\0`) and ETX
/// (`\u{3}`) characters in the places where spaces belong.
///
/// # Behavior
///
/// - A NUL becomes a space; an ETX is dropped, so the `\0\u{3}` separator
///   pattern turns into a single space.
/// - `\t`, `\n` and `\r` are kept.
/// - Every other ASCII control character is removed.
/// - Runs of spaces (including spaces produced from NULs) collapse to one
///   space. A space directly after a tab is dropped as well; a space after
///   `\n` or `\r` is kept.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // NUL+ETX used as a word separator
/// assert_eq!(
///     sanitize_extracted_text("a\0\u{3}sergeant\0\u{3}and"),
///     "a sergeant and"
/// );
///
/// // A lone NUL also becomes a space
/// assert_eq!(sanitize_extracted_text("word\0another"), "word another");
///
/// // Text without control characters is returned unchanged
/// assert_eq!(sanitize_extracted_text("Normal text"), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // The output never outgrows the input: NUL -> space is one byte for one byte.
    let mut cleaned = String::with_capacity(text.len());
    // True while the last emitted character was a space or a tab.
    let mut after_space = false;

    for ch in text.chars() {
        match ch {
            // NUL and ordinary spaces share the same collapse rule. An ETX that
            // follows a NUL is discarded by the control-character arm below.
            '\0' | ' ' => {
                if !after_space {
                    cleaned.push(' ');
                    after_space = true;
                }
            }

            // Whitespace that is kept verbatim; only a tab swallows a following space.
            '\t' | '\n' | '\r' => {
                cleaned.push(ch);
                after_space = ch == '\t';
            }

            // Remaining ASCII control characters (ETX included) vanish without
            // touching the space-tracking state.
            other if other.is_ascii_control() => {}

            // Everything else is ordinary text.
            other => {
                cleaned.push(other);
                after_space = false;
            }
        }
    }

    cleaned
}
1028
1029#[cfg(test)]
1030mod tests {
1031    use super::*;
1032
1033    #[test]
1034    fn test_matrix_multiplication() {
1035        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1036        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1037
1038        let result = multiply_matrix(&identity, &translation);
1039        assert_eq!(result, translation);
1040
1041        let result2 = multiply_matrix(&translation, &identity);
1042        assert_eq!(result2, translation);
1043    }
1044
1045    #[test]
1046    fn test_transform_point() {
1047        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1048        let (x, y) = transform_point(5.0, 5.0, &translation);
1049        assert_eq!(x, 15.0);
1050        assert_eq!(y, 25.0);
1051    }
1052
1053    #[test]
1054    fn test_extraction_options_default() {
1055        let options = ExtractionOptions::default();
1056        assert!(!options.preserve_layout);
1057        assert_eq!(options.space_threshold, 0.3);
1058        assert_eq!(options.newline_threshold, 10.0);
1059        assert!(options.sort_by_position);
1060        assert!(!options.detect_columns);
1061        assert_eq!(options.column_threshold, 50.0);
1062        assert!(options.merge_hyphenated);
1063    }
1064
1065    #[test]
1066    fn test_extraction_options_custom() {
1067        let options = ExtractionOptions {
1068            preserve_layout: true,
1069            space_threshold: 0.5,
1070            newline_threshold: 15.0,
1071            sort_by_position: false,
1072            detect_columns: true,
1073            column_threshold: 75.0,
1074            merge_hyphenated: false,
1075            track_space_decisions: false,
1076        };
1077        assert!(options.preserve_layout);
1078        assert_eq!(options.space_threshold, 0.5);
1079        assert_eq!(options.newline_threshold, 15.0);
1080        assert!(!options.sort_by_position);
1081        assert!(options.detect_columns);
1082        assert_eq!(options.column_threshold, 75.0);
1083        assert!(!options.merge_hyphenated);
1084    }
1085
1086    #[test]
1087    fn test_parse_font_style_bold() {
1088        // PostScript style
1089        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1090        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1091
1092        // TrueType style
1093        assert_eq!(parse_font_style("Arial Bold"), (true, false));
1094        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1095
1096        // Short form
1097        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1098    }
1099
1100    #[test]
1101    fn test_parse_font_style_italic() {
1102        // PostScript style
1103        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1104        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1105
1106        // TrueType style
1107        assert_eq!(parse_font_style("Arial Italic"), (false, true));
1108        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1109
1110        // Short form
1111        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1112    }
1113
1114    #[test]
1115    fn test_parse_font_style_bold_italic() {
1116        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1117        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1118        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1119    }
1120
1121    #[test]
1122    fn test_parse_font_style_regular() {
1123        assert_eq!(parse_font_style("Helvetica"), (false, false));
1124        assert_eq!(parse_font_style("Times-Roman"), (false, false));
1125        assert_eq!(parse_font_style("Courier"), (false, false));
1126        assert_eq!(parse_font_style("Arial"), (false, false));
1127    }
1128
1129    #[test]
1130    fn test_parse_font_style_edge_cases() {
1131        // Empty and unusual cases
1132        assert_eq!(parse_font_style(""), (false, false));
1133        assert_eq!(parse_font_style("UnknownFont"), (false, false));
1134
1135        // Case insensitive
1136        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1137        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1138    }
1139
1140    #[test]
1141    fn test_text_fragment() {
1142        let fragment = TextFragment {
1143            text: "Hello".to_string(),
1144            x: 100.0,
1145            y: 200.0,
1146            width: 50.0,
1147            height: 12.0,
1148            font_size: 10.0,
1149            font_name: None,
1150            is_bold: false,
1151            is_italic: false,
1152            color: None,
1153            space_decisions: Vec::new(),
1154        };
1155        assert_eq!(fragment.text, "Hello");
1156        assert_eq!(fragment.x, 100.0);
1157        assert_eq!(fragment.y, 200.0);
1158        assert_eq!(fragment.width, 50.0);
1159        assert_eq!(fragment.height, 12.0);
1160        assert_eq!(fragment.font_size, 10.0);
1161    }
1162
1163    #[test]
1164    fn test_extracted_text() {
1165        let fragments = vec![
1166            TextFragment {
1167                text: "Hello".to_string(),
1168                x: 100.0,
1169                y: 200.0,
1170                width: 50.0,
1171                height: 12.0,
1172                font_size: 10.0,
1173                font_name: None,
1174                is_bold: false,
1175                is_italic: false,
1176                color: None,
1177                space_decisions: Vec::new(),
1178            },
1179            TextFragment {
1180                text: "World".to_string(),
1181                x: 160.0,
1182                y: 200.0,
1183                width: 50.0,
1184                height: 12.0,
1185                font_size: 10.0,
1186                font_name: None,
1187                is_bold: false,
1188                is_italic: false,
1189                color: None,
1190                space_decisions: Vec::new(),
1191            },
1192        ];
1193
1194        let extracted = ExtractedText {
1195            text: "Hello World".to_string(),
1196            fragments: fragments,
1197        };
1198
1199        assert_eq!(extracted.text, "Hello World");
1200        assert_eq!(extracted.fragments.len(), 2);
1201        assert_eq!(extracted.fragments[0].text, "Hello");
1202        assert_eq!(extracted.fragments[1].text, "World");
1203    }
1204
1205    #[test]
1206    fn test_text_state_default() {
1207        let state = TextState::default();
1208        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1209        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1210        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1211        assert_eq!(state.leading, 0.0);
1212        assert_eq!(state.char_space, 0.0);
1213        assert_eq!(state.word_space, 0.0);
1214        assert_eq!(state.horizontal_scale, 100.0);
1215        assert_eq!(state.text_rise, 0.0);
1216        assert_eq!(state.font_size, 0.0);
1217        assert!(state.font_name.is_none());
1218        assert_eq!(state.render_mode, 0);
1219    }
1220
1221    #[test]
1222    fn test_matrix_operations() {
1223        // Test rotation matrix
1224        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
1225        let (x, y) = transform_point(1.0, 0.0, &rotation);
1226        assert_eq!(x, 0.0);
1227        assert_eq!(y, 1.0);
1228
1229        // Test scaling matrix
1230        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1231        let (x, y) = transform_point(5.0, 5.0, &scale);
1232        assert_eq!(x, 10.0);
1233        assert_eq!(y, 15.0);
1234
1235        // Test complex transformation
1236        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1237        let (x, y) = transform_point(1.0, 1.0, &complex);
1238        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
1239        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
1240    }
1241
1242    #[test]
1243    fn test_text_extractor_new() {
1244        let extractor = TextExtractor::new();
1245        let options = extractor.options;
1246        assert!(!options.preserve_layout);
1247        assert_eq!(options.space_threshold, 0.3);
1248        assert_eq!(options.newline_threshold, 10.0);
1249        assert!(options.sort_by_position);
1250        assert!(!options.detect_columns);
1251        assert_eq!(options.column_threshold, 50.0);
1252        assert!(options.merge_hyphenated);
1253    }
1254
1255    #[test]
1256    fn test_text_extractor_with_options() {
1257        let options = ExtractionOptions {
1258            preserve_layout: true,
1259            space_threshold: 0.3,
1260            newline_threshold: 12.0,
1261            sort_by_position: false,
1262            detect_columns: true,
1263            column_threshold: 60.0,
1264            merge_hyphenated: false,
1265            track_space_decisions: false,
1266        };
1267        let extractor = TextExtractor::with_options(options.clone());
1268        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1269        assert_eq!(extractor.options.space_threshold, options.space_threshold);
1270        assert_eq!(
1271            extractor.options.newline_threshold,
1272            options.newline_threshold
1273        );
1274        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1275        assert_eq!(extractor.options.detect_columns, options.detect_columns);
1276        assert_eq!(extractor.options.column_threshold, options.column_threshold);
1277        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1278    }
1279
1280    // =========================================================================
1281    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1282    // =========================================================================
1283
1284    #[test]
1285    fn test_calculate_text_width_with_no_font_info() {
1286        // Test fallback: should use simplified calculation
1287        let width = calculate_text_width("Hello", 12.0, None);
1288
1289        // Expected: 5 chars * 12.0 * 0.5 = 30.0
1290        assert_eq!(
1291            width, 30.0,
1292            "Without font info, should use simplified calculation: len * font_size * 0.5"
1293        );
1294    }
1295
1296    #[test]
1297    fn test_calculate_text_width_with_empty_metrics() {
1298        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1299
1300        // Font with no widths array
1301        let font_info = FontInfo {
1302            name: "TestFont".to_string(),
1303            font_type: "Type1".to_string(),
1304            encoding: None,
1305            to_unicode: None,
1306            differences: None,
1307            descendant_font: None,
1308            cid_to_gid_map: None,
1309            metrics: FontMetrics {
1310                first_char: None,
1311                last_char: None,
1312                widths: None,
1313                missing_width: Some(500.0),
1314                kerning: None,
1315            },
1316        };
1317
1318        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1319
1320        // Should fall back to simplified calculation
1321        assert_eq!(
1322            width, 30.0,
1323            "Without widths array, should fall back to simplified calculation"
1324        );
1325    }
1326
1327    #[test]
1328    fn test_calculate_text_width_with_complete_metrics() {
1329        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1330
1331        // Font with complete metrics for ASCII range 32-126
1332        // Simulate typical Helvetica widths (in 1/1000 units)
1333        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
1334
1335        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
1336        widths[72 - 32] = 722.0; // 'H' is ASCII 72
1337        widths[101 - 32] = 556.0; // 'e' is ASCII 101
1338        widths[108 - 32] = 278.0; // 'l' is ASCII 108
1339        widths[111 - 32] = 611.0; // 'o' is ASCII 111
1340
1341        let font_info = FontInfo {
1342            name: "Helvetica".to_string(),
1343            font_type: "Type1".to_string(),
1344            encoding: None,
1345            to_unicode: None,
1346            differences: None,
1347            descendant_font: None,
1348            cid_to_gid_map: None,
1349            metrics: FontMetrics {
1350                first_char: Some(32),
1351                last_char: Some(126),
1352                widths: Some(widths),
1353                missing_width: Some(500.0),
1354                kerning: None,
1355            },
1356        };
1357
1358        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1359
1360        // Expected calculation (widths in glyph space / 1000 * font_size):
1361        // H: 722/1000 * 12 = 8.664
1362        // e: 556/1000 * 12 = 6.672
1363        // l: 278/1000 * 12 = 3.336
1364        // l: 278/1000 * 12 = 3.336
1365        // o: 611/1000 * 12 = 7.332
1366        // Total: 29.34
1367        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1368        let tolerance = 0.0001; // Floating point tolerance
1369        assert!(
1370            (width - expected).abs() < tolerance,
1371            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1372            expected,
1373            width,
1374            (width - expected).abs()
1375        );
1376
1377        // Verify it's different from simplified calculation
1378        let simplified = 5.0 * 12.0 * 0.5; // 30.0
1379        assert_ne!(
1380            width, simplified,
1381            "Metrics-based calculation should differ from simplified (30.0)"
1382        );
1383    }
1384
1385    #[test]
1386    fn test_calculate_text_width_character_outside_range() {
1387        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1388
1389        // Font with narrow range (only covers 'A'-'Z')
1390        let widths = vec![722.0; 26]; // All uppercase letters same width
1391
1392        let font_info = FontInfo {
1393            name: "TestFont".to_string(),
1394            font_type: "Type1".to_string(),
1395            encoding: None,
1396            to_unicode: None,
1397            differences: None,
1398            descendant_font: None,
1399            cid_to_gid_map: None,
1400            metrics: FontMetrics {
1401                first_char: Some(65), // 'A'
1402                last_char: Some(90),  // 'Z'
1403                widths: Some(widths),
1404                missing_width: Some(500.0),
1405                kerning: None,
1406            },
1407        };
1408
1409        // Test with character outside range
1410        let width = calculate_text_width("A1", 10.0, Some(&font_info));
1411
1412        // Expected:
1413        // 'A' (65) is in range: 722/1000 * 10 = 7.22
1414        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
1415        // Total: 12.22
1416        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1417        assert_eq!(
1418            width, expected,
1419            "Should use missing_width for characters outside range"
1420        );
1421    }
1422
1423    #[test]
1424    fn test_calculate_text_width_missing_width_in_array() {
1425        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1426
1427        // Font with incomplete widths array (some characters have 0.0)
1428        let mut widths = vec![500.0; 95]; // Default width
1429        widths[10] = 0.0; // Character at index 10 has no width defined
1430
1431        let font_info = FontInfo {
1432            name: "TestFont".to_string(),
1433            font_type: "Type1".to_string(),
1434            encoding: None,
1435            to_unicode: None,
1436            differences: None,
1437            descendant_font: None,
1438            cid_to_gid_map: None,
1439            metrics: FontMetrics {
1440                first_char: Some(32),
1441                last_char: Some(126),
1442                widths: Some(widths),
1443                missing_width: Some(600.0),
1444                kerning: None,
1445            },
1446        };
1447
1448        // Character 42 (index 10 from first_char 32)
1449        let char_code = 42u8 as char; // '*'
1450        let text = char_code.to_string();
1451        let width = calculate_text_width(&text, 10.0, Some(&font_info));
1452
1453        // Character is in range but width is 0.0, should NOT fall back to missing_width
1454        // (0.0 is a valid width for zero-width characters)
1455        assert_eq!(
1456            width, 0.0,
1457            "Should use 0.0 width from array, not missing_width"
1458        );
1459    }
1460
1461    #[test]
1462    fn test_calculate_text_width_empty_string() {
1463        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1464
1465        let font_info = FontInfo {
1466            name: "TestFont".to_string(),
1467            font_type: "Type1".to_string(),
1468            encoding: None,
1469            to_unicode: None,
1470            differences: None,
1471            descendant_font: None,
1472            cid_to_gid_map: None,
1473            metrics: FontMetrics {
1474                first_char: Some(32),
1475                last_char: Some(126),
1476                widths: Some(vec![500.0; 95]),
1477                missing_width: Some(500.0),
1478                kerning: None,
1479            },
1480        };
1481
1482        let width = calculate_text_width("", 12.0, Some(&font_info));
1483        assert_eq!(width, 0.0, "Empty string should have zero width");
1484
1485        // Also test without font info
1486        let width_no_font = calculate_text_width("", 12.0, None);
1487        assert_eq!(
1488            width_no_font, 0.0,
1489            "Empty string should have zero width (no font)"
1490        );
1491    }
1492
1493    #[test]
1494    fn test_calculate_text_width_unicode_characters() {
1495        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1496
1497        // Font with limited ASCII range
1498        let font_info = FontInfo {
1499            name: "TestFont".to_string(),
1500            font_type: "Type1".to_string(),
1501            encoding: None,
1502            to_unicode: None,
1503            differences: None,
1504            descendant_font: None,
1505            cid_to_gid_map: None,
1506            metrics: FontMetrics {
1507                first_char: Some(32),
1508                last_char: Some(126),
1509                widths: Some(vec![500.0; 95]),
1510                missing_width: Some(600.0),
1511                kerning: None,
1512            },
1513        };
1514
1515        // Test with Unicode characters outside ASCII range
1516        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1517
1518        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
1519        // Expected: 600/1000 * 10 = 6.0
1520        assert_eq!(
1521            width, 6.0,
1522            "Unicode character outside range should use missing_width"
1523        );
1524    }
1525
1526    #[test]
1527    fn test_calculate_text_width_different_font_sizes() {
1528        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1529
1530        let font_info = FontInfo {
1531            name: "TestFont".to_string(),
1532            font_type: "Type1".to_string(),
1533            encoding: None,
1534            to_unicode: None,
1535            differences: None,
1536            descendant_font: None,
1537            cid_to_gid_map: None,
1538            metrics: FontMetrics {
1539                first_char: Some(65), // 'A'
1540                last_char: Some(65),  // 'A'
1541                widths: Some(vec![722.0]),
1542                missing_width: Some(500.0),
1543                kerning: None,
1544            },
1545        };
1546
1547        // Test same character with different font sizes
1548        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1549        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1550
1551        // Widths should scale linearly with font size
1552        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1553        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1554        assert_eq!(
1555            width_20,
1556            width_10 * 2.0,
1557            "Width should scale linearly with font size"
1558        );
1559    }
1560
1561    #[test]
1562    fn test_calculate_text_width_proportional_vs_monospace() {
1563        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1564
1565        // Simulate proportional font (different widths)
1566        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1567        let proportional_font = FontInfo {
1568            name: "Helvetica".to_string(),
1569            font_type: "Type1".to_string(),
1570            encoding: None,
1571            to_unicode: None,
1572            differences: None,
1573            descendant_font: None,
1574            cid_to_gid_map: None,
1575            metrics: FontMetrics {
1576                first_char: Some(105), // 'i'
1577                last_char: Some(107),  // covers i, j, k
1578                widths: Some(proportional_widths),
1579                missing_width: Some(500.0),
1580                kerning: None,
1581            },
1582        };
1583
1584        // Simulate monospace font (same width)
1585        let monospace_widths = vec![600.0, 600.0, 600.0];
1586        let monospace_font = FontInfo {
1587            name: "Courier".to_string(),
1588            font_type: "Type1".to_string(),
1589            encoding: None,
1590            to_unicode: None,
1591            differences: None,
1592            descendant_font: None,
1593            cid_to_gid_map: None,
1594            metrics: FontMetrics {
1595                first_char: Some(105),
1596                last_char: Some(107),
1597                widths: Some(monospace_widths),
1598                missing_width: Some(600.0),
1599                kerning: None,
1600            },
1601        };
1602
1603        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1604        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1605
1606        // Proportional 'i' should be narrower than monospace 'i'
1607        assert!(
1608            prop_width < mono_width,
1609            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1610            prop_width,
1611            mono_width
1612        );
1613    }
1614
1615    // =========================================================================
1616    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1617    // =========================================================================
1618
1619    #[test]
1620    fn test_calculate_text_width_with_kerning() {
1621        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1622        use std::collections::HashMap;
1623
1624        // Create a font with kerning pairs
1625        let mut widths = vec![500.0; 95]; // ASCII 32-126
1626        widths[65 - 32] = 722.0; // 'A'
1627        widths[86 - 32] = 722.0; // 'V'
1628        widths[87 - 32] = 944.0; // 'W'
1629
1630        let mut kerning = HashMap::new();
1631        // Typical kerning pairs (in FUnits, 1/1000)
1632        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1633        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1634
1635        let font_info = FontInfo {
1636            name: "Helvetica".to_string(),
1637            font_type: "Type1".to_string(),
1638            encoding: None,
1639            to_unicode: None,
1640            differences: None,
1641            descendant_font: None,
1642            cid_to_gid_map: None,
1643            metrics: FontMetrics {
1644                first_char: Some(32),
1645                last_char: Some(126),
1646                widths: Some(widths),
1647                missing_width: Some(500.0),
1648                kerning: Some(kerning),
1649            },
1650        };
1651
1652        // Test "AV" with kerning
1653        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1654        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1655        //         = 17.328 - 0.6 = 16.728
1656        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1657        let tolerance = 0.0001;
1658        assert!(
1659            (width_av - expected_av).abs() < tolerance,
1660            "AV with kerning: expected {}, got {}, diff {}",
1661            expected_av,
1662            width_av,
1663            (width_av - expected_av).abs()
1664        );
1665
1666        // Test "AW" with different kerning value
1667        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1668        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1669        //         = 19.992 - 0.48 = 19.512
1670        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1671        assert!(
1672            (width_aw - expected_aw).abs() < tolerance,
1673            "AW with kerning: expected {}, got {}, diff {}",
1674            expected_aw,
1675            width_aw,
1676            (width_aw - expected_aw).abs()
1677        );
1678
1679        // Test "VA" with NO kerning (pair not in HashMap)
1680        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1681        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1682        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1683        assert!(
1684            (width_va - expected_va).abs() < tolerance,
1685            "VA without kerning: expected {}, got {}, diff {}",
1686            expected_va,
1687            width_va,
1688            (width_va - expected_va).abs()
1689        );
1690
1691        // Verify kerning makes a measurable difference
1692        assert!(
1693            width_av < width_va,
1694            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1695            width_av,
1696            width_va
1697        );
1698    }
1699
1700    #[test]
1701    fn test_parse_truetype_kern_table_minimal() {
1702        use crate::text::extraction_cmap::parse_truetype_kern_table;
1703
1704        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1705        // Structure:
1706        // 1. Offset table (12 bytes)
1707        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1708        // 3. 'head' table data (54 bytes)
1709        // 4. 'kern' table data (30 bytes)
1710        // Total: 128 bytes
1711        let mut ttf_data = vec![
1712            // Offset table
1713            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1714            0x00, 0x02, // numTables: 2
1715            0x00, 0x20, // searchRange: 32
1716            0x00, 0x01, // entrySelector: 1
1717            0x00, 0x00, // rangeShift: 0
1718        ];
1719
1720        // Table directory entry 1: 'head' table
1721        ttf_data.extend_from_slice(b"head"); // tag
1722        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1723        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1724        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1725
1726        // Table directory entry 2: 'kern' table
1727        ttf_data.extend_from_slice(b"kern"); // tag
1728        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1729        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1730        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1731
1732        // 'head' table data (54 bytes of zeros - minimal valid head table)
1733        ttf_data.extend_from_slice(&[0u8; 54]);
1734
1735        // 'kern' table data (34 bytes)
1736        ttf_data.extend_from_slice(&[
1737            // Kern table header
1738            0x00, 0x00, // version: 0
1739            0x00, 0x01, // nTables: 1
1740            // Subtable header
1741            0x00, 0x00, // version: 0
1742            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1743            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1744            0x00, 0x02, // nPairs: 2
1745            0x00, 0x08, // searchRange: 8
1746            0x00, 0x00, // entrySelector: 0
1747            0x00, 0x04, // rangeShift: 4
1748            // Kerning pair 1: A + V → -50
1749            0x00, 0x41, // left glyph: 65 ('A')
1750            0x00, 0x56, // right glyph: 86 ('V')
1751            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1752            // Kerning pair 2: A + W → -40
1753            0x00, 0x41, // left glyph: 65 ('A')
1754            0x00, 0x57, // right glyph: 87 ('W')
1755            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1756        ]);
1757
1758        let result = parse_truetype_kern_table(&ttf_data);
1759        assert!(
1760            result.is_ok(),
1761            "Should parse minimal kern table successfully: {:?}",
1762            result.err()
1763        );
1764
1765        let kerning_map = result.unwrap();
1766        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1767
1768        // Verify pair 1: A + V → -50
1769        assert_eq!(
1770            kerning_map.get(&(65, 86)),
1771            Some(&-50.0),
1772            "Should have A+V kerning pair with value -50"
1773        );
1774
1775        // Verify pair 2: A + W → -40
1776        assert_eq!(
1777            kerning_map.get(&(65, 87)),
1778            Some(&-40.0),
1779            "Should have A+W kerning pair with value -40"
1780        );
1781    }
1782
1783    #[test]
1784    fn test_parse_kern_table_no_kern_table() {
1785        use crate::text::extraction_cmap::extract_truetype_kerning;
1786
1787        // TrueType font data WITHOUT a 'kern' table
1788        // Structure:
1789        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1790        // - Table directory: 1 entry for 'head' table (not 'kern')
1791        let ttf_data = vec![
1792            // Offset table
1793            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1794            0x00, 0x01, // numTables: 1
1795            0x00, 0x10, // searchRange: 16
1796            0x00, 0x00, // entrySelector: 0
1797            0x00, 0x00, // rangeShift: 0
1798            // Table directory entry: 'head' table (not 'kern')
1799            b'h', b'e', b'a', b'd', // tag: 'head'
1800            0x00, 0x00, 0x00, 0x00, // checksum
1801            0x00, 0x00, 0x00, 0x1C, // offset: 28
1802            0x00, 0x00, 0x00, 0x36, // length: 54
1803            // Mock 'head' table data (54 bytes of zeros)
1804            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1805            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1806            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1807            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1808        ];
1809
1810        let result = extract_truetype_kerning(&ttf_data);
1811        assert!(
1812            result.is_ok(),
1813            "Should gracefully handle missing kern table"
1814        );
1815
1816        let kerning_map = result.unwrap();
1817        assert!(
1818            kerning_map.is_empty(),
1819            "Should return empty HashMap when no kern table exists"
1820        );
1821    }
1822}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs