oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Text extraction options
///
/// Controls how [`TextExtractor`] turns the text-showing operators of a page
/// into a string and, optionally, into positioned [`TextFragment`]s.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning).
    ///
    /// When `true`, positioned [`TextFragment`]s are collected during
    /// extraction and the final text is rebuilt from them.
    pub preserve_layout: bool,
    /// Horizontal gap above which a space character is inserted between two
    /// consecutive text runs, expressed as a multiple of the current font
    /// size (the comparison is `gap > space_threshold * font_size`).
    pub space_threshold: f64,
    /// Minimum vertical distance between two consecutive text runs that
    /// triggers a newline. Compared against positions obtained after the text
    /// matrix and CTM have been applied; also used as the height of the Y
    /// bands when fragments are sorted.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts).
    ///
    /// Sorting operates on collected fragments, so it has no effect when no
    /// fragments were recorded.
    pub sort_by_position: bool,
    /// Detect and handle columns.
    ///
    /// Column detection runs as part of fragment sorting, so it is only
    /// consulted when `sort_by_position` is also enabled.
    pub detect_columns: bool,
    /// Column separation threshold (in page units): the minimum horizontal
    /// gap between neighbouring fragments on one line that is treated as a
    /// column boundary.
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends: when a line ends with `-` the
    /// hyphen is dropped and the next line is appended without a newline.
    pub merge_hyphenated: bool,
    /// Track space insertion decisions in each TextFragment (default: false).
    /// When false: zero overhead. When true: populates `TextFragment::space_decisions`.
    pub track_space_decisions: bool,
}
37
/// Default extraction settings: plain text output (no layout preservation),
/// position sorting and hyphen merging enabled, column detection and space
/// decision tracking disabled.
impl Default for ExtractionOptions {
    fn default() -> Self {
        Self {
            preserve_layout: false,
            // Gaps wider than 30% of the font size become a space.
            space_threshold: 0.3,
            // Vertical jumps larger than 10 units start a new line.
            newline_threshold: 10.0,
            sort_by_position: true,
            detect_columns: false,
            // Only consulted when `detect_columns` is enabled.
            column_threshold: 50.0,
            merge_hyphenated: true,
            track_space_decisions: false,
        }
    }
}
52
/// Extracted text with position information
///
/// Result of extracting a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text content. When layout preservation is enabled and
    /// fragments were collected, this is rebuilt from the processed fragments.
    pub text: String,
    /// Text fragments with position information (if preserve_layout is true);
    /// empty otherwise.
    pub fragments: Vec<TextFragment>,
}
61
/// Metadata about a space insertion decision during text extraction.
/// Only populated when [`ExtractionOptions::track_space_decisions`] is `true`.
///
/// Each record describes one point where the extractor compared a horizontal
/// gap against a threshold, whether or not a space was actually inserted.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Character offset in the extracted text.
    pub offset: usize,
    /// Actual horizontal gap (dx) in text space units.
    pub dx: f64,
    /// The threshold used at this point.
    pub threshold: f64,
    /// Confidence: `|dx - threshold| / threshold`, clamped to [0.0, 1.0].
    /// Values near 0.0 mean the gap was close to the threshold.
    pub confidence: f64,
    /// Whether a space was inserted.
    pub inserted: bool,
}
77
/// A fragment of text with position information
///
/// Fragments are recorded while the content stream is processed and may later
/// be sorted and merged with neighbouring fragments on the same line.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Text content
    pub text: String,
    /// X position in page coordinates (start of the text, after applying the
    /// text matrix and the CTM)
    pub x: f64,
    /// Y position in page coordinates (after applying the text matrix and the
    /// CTM)
    pub y: f64,
    /// Width of the text; for merged fragments, the span from the first
    /// fragment's start to the last fragment's end
    pub width: f64,
    /// Height of the text (set to the font size during extraction)
    pub height: f64,
    /// Font size
    pub font_size: f64,
    /// Font name (if known) - used for kerning-aware text spacing
    pub font_name: Option<String>,
    /// Whether the font is bold (detected from font name)
    pub is_bold: bool,
    /// Whether the font is italic (detected from font name)
    pub is_italic: bool,
    /// Fill color of the text (from graphics state); `None` when no fill
    /// color operator was seen before the text was shown
    pub color: Option<Color>,
    /// Space insertion decisions (empty unless `track_space_decisions` is true).
    pub space_decisions: Vec<SpaceDecision>,
}
104
/// Text extraction state
///
/// Mirrors the parts of the PDF text/graphics state that are updated while a
/// page's content operators are processed. Matrices are stored in the
/// six-element `[a, b, c, d, e, f]` form.
struct TextState {
    /// Current text matrix
    text_matrix: [f64; 6],
    /// Current text line matrix (start of the current line; updated together
    /// with `text_matrix` by the positioning operators)
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM)
    ctm: [f64; 6],
    /// Text leading (line spacing) used when moving to the next line
    leading: f64,
    /// Character spacing
    char_space: f64,
    /// Word spacing
    word_space: f64,
    /// Horizontal scaling as a percentage (100.0 = unscaled)
    horizontal_scale: f64,
    /// Text rise
    text_rise: f64,
    /// Current font size
    font_size: f64,
    /// Current font name (resource name from the font-selection operator);
    /// `None` until a font has been selected
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    render_mode: u8,
    /// Fill color (for text rendering); `None` until a fill color operator
    /// has been seen
    fill_color: Option<Color>,
}
132
133impl Default for TextState {
134    fn default() -> Self {
135        Self {
136            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139            leading: 0.0,
140            char_space: 0.0,
141            word_space: 0.0,
142            horizontal_scale: 100.0,
143            text_rise: 0.0,
144            font_size: 0.0,
145            font_name: None,
146            render_mode: 0,
147            fill_color: None,
148        }
149    }
150}
151
/// Parse font style (bold/italic) from font name
///
/// The name is lower-cased and searched for well-known style markers, so the
/// check is case-insensitive. It covers PostScript-style names (e.g.
/// "Helvetica-Bold", "Times-BoldItalic") as well as TrueType-style names
/// (e.g. "Arial Bold", "Courier Oblique").
///
/// The match is substring based: any `-b` / `-i` sequence counts as a style
/// marker, so a name such as "Helvetica-Black" is reported as bold.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Markers that may appear anywhere in the lower-cased name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    // A style is present if any marker occurs in the name or the name ends
    // with the stand-alone single-letter suffix.
    let has_style = |markers: &[&str], trailing: &str| {
        markers.iter().any(|&marker| lowered.contains(marker)) || lowered.ends_with(trailing)
    };

    let bold = has_style(&BOLD_MARKERS, " b");
    let italic = has_style(&ITALIC_MARKERS, " i");

    (bold, italic)
}
190
/// Text extractor for PDF pages with CMap support
///
/// Holds the extraction options plus two font caches, so one extractor can be
/// reused across the pages of a document without re-parsing shared fonts.
pub struct TextExtractor {
    /// Options applied to every extraction performed by this extractor
    options: ExtractionOptions,
    /// Font cache for the current page (name-keyed, rebuilt per page since names are page-local)
    font_cache: HashMap<String, FontInfo>,
    /// Persistent font cache keyed by PDF object reference — avoids re-parsing the same font
    /// object across pages. Most multi-page PDFs reuse the same font objects.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202    /// Create a new text extractor with default options
203    pub fn new() -> Self {
204        Self {
205            options: ExtractionOptions::default(),
206            font_cache: HashMap::new(),
207            font_object_cache: HashMap::new(),
208        }
209    }
210
211    /// Create a text extractor with custom options
212    pub fn with_options(options: ExtractionOptions) -> Self {
213        Self {
214            options,
215            font_cache: HashMap::new(),
216            font_object_cache: HashMap::new(),
217        }
218    }
219
220    /// Extract text from a PDF document
221    pub fn extract_from_document<R: Read + Seek>(
222        &mut self,
223        document: &PdfDocument<R>,
224    ) -> ParseResult<Vec<ExtractedText>> {
225        let page_count = document.page_count()?;
226        let mut results = Vec::new();
227
228        for i in 0..page_count {
229            let text = self.extract_from_page(document, i)?;
230            results.push(text);
231        }
232
233        Ok(results)
234    }
235
236    /// Extract text from a specific page
237    pub fn extract_from_page<R: Read + Seek>(
238        &mut self,
239        document: &PdfDocument<R>,
240        page_index: u32,
241    ) -> ParseResult<ExtractedText> {
242        // Get the page
243        let page = document.get_page(page_index)?;
244
245        // Extract font resources first
246        {
247            let _span = tracing::info_span!("font_resources").entered();
248            self.extract_font_resources(&page, document)?;
249        }
250
251        // Get content streams
252        let streams = {
253            let _span = tracing::info_span!("stream_decompress").entered();
254            page.content_streams_with_document(document)?
255        };
256
257        let mut extracted_text = String::new();
258        let mut fragments = Vec::new();
259        let mut state = TextState::default();
260        let mut in_text_object = false;
261        let mut last_x = 0.0;
262        let mut last_y = 0.0;
263
264        // Process each content stream
265        for (stream_idx, stream_data) in streams.iter().enumerate() {
266            let operations = match {
267                let _span = tracing::info_span!("content_parse").entered();
268                ContentParser::parse_content(stream_data)
269            } {
270                Ok(ops) => ops,
271                Err(e) => {
272                    // Enhanced diagnostic logging for content stream parsing failures
273                    tracing::debug!(
274                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
275                        page_index + 1,
276                        stream_idx + 1,
277                        streams.len()
278                    );
279                    tracing::debug!("         Error: {}", e);
280                    tracing::debug!("         Stream size: {} bytes", stream_data.len());
281
282                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
283                    let preview_len = stream_data.len().min(100);
284                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
285                    tracing::debug!(
286                        "         Stream preview (first {} bytes): {:?}",
287                        preview_len,
288                        preview.chars().take(80).collect::<String>()
289                    );
290
291                    // Continue processing other streams
292                    continue;
293                }
294            };
295
296            let _ops_span = tracing::info_span!("text_ops_loop").entered();
297            for op in operations {
298                match op {
299                    ContentOperation::BeginText => {
300                        in_text_object = true;
301                        // Reset text matrix to identity
302                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
303                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
304                    }
305
306                    ContentOperation::EndText => {
307                        in_text_object = false;
308                    }
309
310                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
311                        state.text_matrix =
312                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
313                        state.text_line_matrix =
314                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
315                    }
316
317                    ContentOperation::MoveText(tx, ty) => {
318                        // Update text matrix by translation
319                        let new_matrix = multiply_matrix(
320                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
321                            &state.text_line_matrix,
322                        );
323                        state.text_matrix = new_matrix;
324                        state.text_line_matrix = new_matrix;
325                    }
326
327                    ContentOperation::NextLine => {
328                        // Move to next line using current leading
329                        let new_matrix = multiply_matrix(
330                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
331                            &state.text_line_matrix,
332                        );
333                        state.text_matrix = new_matrix;
334                        state.text_line_matrix = new_matrix;
335                    }
336
337                    ContentOperation::ShowText(text) => {
338                        if in_text_object {
339                            let text_bytes = &text;
340                            let decoded = self.decode_text(text_bytes, &state)?;
341
342                            // Calculate position: Apply text_matrix, then CTM
343                            // Concatenate: final = CTM × text_matrix
344                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
345                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
346
347                            // Add spacing based on position change
348                            if !extracted_text.is_empty() {
349                                let dx = x - last_x;
350                                let dy = (y - last_y).abs();
351
352                                if dy > self.options.newline_threshold {
353                                    extracted_text.push('\n');
354                                } else if dx > self.options.space_threshold * state.font_size {
355                                    extracted_text.push(' ');
356                                }
357                            }
358
359                            extracted_text.push_str(&decoded);
360
361                            // Get font info for accurate width calculation
362                            let font_info = state
363                                .font_name
364                                .as_ref()
365                                .and_then(|name| self.font_cache.get(name));
366
367                            // Calculate width once and reuse
368                            let text_width =
369                                calculate_text_width(&decoded, state.font_size, font_info);
370
371                            if self.options.preserve_layout {
372                                // Detect bold/italic from font name
373                                let (is_bold, is_italic) = state
374                                    .font_name
375                                    .as_ref()
376                                    .map(|name| parse_font_style(name))
377                                    .unwrap_or((false, false));
378
379                                fragments.push(TextFragment {
380                                    text: decoded.clone(),
381                                    x,
382                                    y,
383                                    width: text_width,
384                                    height: state.font_size,
385                                    font_size: state.font_size,
386                                    font_name: state.font_name.clone(),
387                                    is_bold,
388                                    is_italic,
389                                    color: state.fill_color,
390                                    space_decisions: Vec::new(),
391                                });
392                            }
393
394                            // Update position for next text
395                            last_x = x + text_width;
396                            last_y = y;
397
398                            // Update text matrix for next show operation
399                            let tx = text_width * state.horizontal_scale / 100.0;
400                            state.text_matrix =
401                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
402                        }
403                    }
404
405                    ContentOperation::ShowTextArray(array) => {
406                        if in_text_object {
407                            // Get font info for accurate width calculation
408                            let font_info = state
409                                .font_name
410                                .as_ref()
411                                .and_then(|name| self.font_cache.get(name));
412
413                            for item in array {
414                                match item {
415                                    TextElement::Text(text_bytes) => {
416                                        let decoded = self.decode_text(&text_bytes, &state)?;
417                                        extracted_text.push_str(&decoded);
418
419                                        // Update text matrix
420                                        let text_width = calculate_text_width(
421                                            &decoded,
422                                            state.font_size,
423                                            font_info,
424                                        );
425                                        let tx = text_width * state.horizontal_scale / 100.0;
426                                        state.text_matrix = multiply_matrix(
427                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
428                                            &state.text_matrix,
429                                        );
430                                    }
431                                    TextElement::Spacing(adjustment) => {
432                                        // Text position adjustment (negative = move left)
433                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
434                                        state.text_matrix = multiply_matrix(
435                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
436                                            &state.text_matrix,
437                                        );
438                                    }
439                                }
440                            }
441                        }
442                    }
443
444                    ContentOperation::SetFont(name, size) => {
445                        state.font_name = Some(name);
446                        state.font_size = size as f64;
447                    }
448
449                    ContentOperation::SetLeading(leading) => {
450                        state.leading = leading as f64;
451                    }
452
453                    ContentOperation::SetCharSpacing(spacing) => {
454                        state.char_space = spacing as f64;
455                    }
456
457                    ContentOperation::SetWordSpacing(spacing) => {
458                        state.word_space = spacing as f64;
459                    }
460
461                    ContentOperation::SetHorizontalScaling(scale) => {
462                        state.horizontal_scale = scale as f64;
463                    }
464
465                    ContentOperation::SetTextRise(rise) => {
466                        state.text_rise = rise as f64;
467                    }
468
469                    ContentOperation::SetTextRenderMode(mode) => {
470                        state.render_mode = mode as u8;
471                    }
472
473                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
474                        // Update CTM: new_ctm = concat_matrix * current_ctm
475                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
476                        let a = a as f64;
477                        let b = b as f64;
478                        let c = c as f64;
479                        let d = d as f64;
480                        let e = e as f64;
481                        let f = f as f64;
482                        state.ctm = [
483                            a * a0 + b * c0,
484                            a * b0 + b * d0,
485                            c * a0 + d * c0,
486                            c * b0 + d * d0,
487                            e * a0 + f * c0 + e0,
488                            e * b0 + f * d0 + f0,
489                        ];
490                    }
491
492                    // Color operations (Phase 4: Color extraction)
493                    ContentOperation::SetNonStrokingGray(gray) => {
494                        state.fill_color = Some(Color::gray(gray as f64));
495                    }
496
497                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
498                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
499                    }
500
501                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
502                        state.fill_color =
503                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
504                    }
505
506                    _ => {
507                        // Other operations don't affect text extraction
508                    }
509                }
510            }
511        }
512
513        {
514            let _span = tracing::info_span!("layout_finalize").entered();
515
516            // Sort and process fragments if requested
517            if self.options.sort_by_position && !fragments.is_empty() {
518                self.sort_and_merge_fragments(&mut fragments);
519            }
520
521            // Merge close fragments to eliminate spacing artifacts
522            // This is crucial for table detection and structured data extraction
523            if self.options.preserve_layout && !fragments.is_empty() {
524                fragments = self.merge_close_fragments(&fragments);
525            }
526
527            // Reconstruct text from sorted fragments if layout is preserved
528            if self.options.preserve_layout && !fragments.is_empty() {
529                extracted_text = self.reconstruct_text_from_fragments(&fragments);
530            }
531        }
532
533        Ok(ExtractedText {
534            text: extracted_text,
535            fragments,
536        })
537    }
538
539    /// Sort text fragments by position and merge them appropriately
540    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
541        // Sort fragments by Y position (top to bottom) then X position (left to right).
542        //
543        // We quantize Y into bands of `newline_threshold` width so that fragments
544        // on the "same line" get identical Y keys. This ensures the comparator is
545        // a strict total order (transitive), which Rust's sort algorithm requires.
546        // Without quantization, threshold-based "same line" detection breaks
547        // transitivity: A≈B and B≈C does NOT imply A≈C.
548        let threshold = self.options.newline_threshold;
549        fragments.sort_by(|a, b| {
550            // Quantize Y to nearest band (PDF Y increases upward, so negate first)
551            let band_a = if threshold > 0.0 {
552                (-a.y / threshold).round()
553            } else {
554                -a.y
555            };
556            let band_b = if threshold > 0.0 {
557                (-b.y / threshold).round()
558            } else {
559                -b.y
560            };
561
562            // Compare by Y band (top to bottom), then by X within same band
563            band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
564        });
565
566        // Detect columns if requested
567        if self.options.detect_columns {
568            self.detect_and_sort_columns(fragments);
569        }
570    }
571
572    /// Detect columns and re-sort fragments accordingly
573    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
574        // Group fragments by approximate Y position
575        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
576        let mut current_line: Vec<&mut TextFragment> = Vec::new();
577        let mut last_y = f64::INFINITY;
578
579        for fragment in fragments.iter_mut() {
580            let fragment_y = fragment.y;
581            if (last_y - fragment_y).abs() > self.options.newline_threshold
582                && !current_line.is_empty()
583            {
584                lines.push(current_line);
585                current_line = Vec::new();
586            }
587            current_line.push(fragment);
588            last_y = fragment_y;
589        }
590        if !current_line.is_empty() {
591            lines.push(current_line);
592        }
593
594        // Detect column boundaries
595        let mut column_boundaries = vec![0.0];
596        for line in &lines {
597            if line.len() > 1 {
598                for i in 0..line.len() - 1 {
599                    let gap = line[i + 1].x - (line[i].x + line[i].width);
600                    if gap > self.options.column_threshold {
601                        let boundary = line[i].x + line[i].width + gap / 2.0;
602                        if !column_boundaries
603                            .iter()
604                            .any(|&b| (b - boundary).abs() < 10.0)
605                        {
606                            column_boundaries.push(boundary);
607                        }
608                    }
609                }
610            }
611        }
612        column_boundaries.sort_by(|a, b| a.total_cmp(b));
613
614        // Re-sort fragments by column then Y position
615        if column_boundaries.len() > 1 {
616            fragments.sort_by(|a, b| {
617                // Determine column for each fragment
618                let col_a = column_boundaries
619                    .iter()
620                    .position(|&boundary| a.x < boundary)
621                    .unwrap_or(column_boundaries.len())
622                    - 1;
623                let col_b = column_boundaries
624                    .iter()
625                    .position(|&boundary| b.x < boundary)
626                    .unwrap_or(column_boundaries.len())
627                    - 1;
628
629                if col_a != col_b {
630                    col_a.cmp(&col_b)
631                } else {
632                    // Same column, sort by Y position
633                    b.y.total_cmp(&a.y)
634                }
635            });
636        }
637    }
638
639    /// Reconstruct text from sorted fragments
640    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
641        // First, merge consecutive fragments that are very close together
642        let merged_fragments = self.merge_close_fragments(fragments);
643
644        let mut result = String::new();
645        let mut last_y = f64::INFINITY;
646        let mut last_x = 0.0;
647        let mut last_line_ended_with_hyphen = false;
648
649        for fragment in &merged_fragments {
650            // Check if we need a newline
651            let y_diff = (last_y - fragment.y).abs();
652            if !result.is_empty() && y_diff > self.options.newline_threshold {
653                // Handle hyphenation
654                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
655                    // Remove the hyphen and don't add newline
656                    if result.ends_with('-') {
657                        result.pop();
658                    }
659                } else {
660                    result.push('\n');
661                }
662            } else if !result.is_empty() {
663                // Check if we need a space
664                let x_gap = fragment.x - last_x;
665                if x_gap > self.options.space_threshold * fragment.font_size {
666                    result.push(' ');
667                }
668            }
669
670            result.push_str(&fragment.text);
671            last_line_ended_with_hyphen = fragment.text.ends_with('-');
672            last_y = fragment.y;
673            last_x = fragment.x + fragment.width;
674        }
675
676        result
677    }
678
679    /// Merge fragments that are very close together on the same line
680    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
681    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
682        if fragments.is_empty() {
683            return Vec::new();
684        }
685
686        let mut merged = Vec::new();
687        let mut current = fragments[0].clone();
688
689        for fragment in &fragments[1..] {
690            // Check if this fragment is on the same line and very close
691            let y_diff = (current.y - fragment.y).abs();
692            let x_gap = fragment.x - (current.x + current.width);
693
694            // Merge if on same line and gap is less than a character width
695            // Use 0.5 * font_size as threshold - this catches most artificial spacing
696            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
697                && x_gap >= 0.0  // Fragment is to the right
698                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
699
700            if should_merge {
701                // Merge this fragment into current
702                current.text.push_str(&fragment.text);
703                current.width = (fragment.x + fragment.width) - current.x;
704            } else {
705                // Start a new fragment
706                merged.push(current);
707                current = fragment.clone();
708            }
709        }
710
711        merged.push(current);
712        merged
713    }
714
715    /// Extract font resources from page
716    ///
717    /// Clears the per-page name cache (font names are page-local in PDF), but
718    /// reuses previously parsed font objects via `font_object_cache` to avoid
719    /// re-parsing the same font object across multiple pages.
720    fn extract_font_resources<R: Read + Seek>(
721        &mut self,
722        page: &ParsedPage,
723        document: &PdfDocument<R>,
724    ) -> ParseResult<()> {
725        // Clear per-page name mapping (font names like /F1 are page-local)
726        self.font_cache.clear();
727
728        // Try to get resources manually from page dictionary first
729        // This is necessary because ParsedPage.get_resources() may not always work
730        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
731            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
732            {
733                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
734                    for (font_name, font_obj) in font_dict.0.iter() {
735                        if let Some(font_ref) = font_obj.as_reference() {
736                            self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
737                        }
738                    }
739                }
740            }
741        } else if let Some(resources) = page.get_resources() {
742            // Fallback to get_resources() if Resources is not a reference
743            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
744                for (font_name, font_obj) in font_dict.0.iter() {
745                    if let Some(font_ref) = font_obj.as_reference() {
746                        self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
747                    }
748                }
749            }
750        }
751
752        Ok(())
753    }
754
755    /// Cache a font, reusing the persistent object cache when possible.
756    fn cache_font_by_ref<R: Read + Seek>(
757        &mut self,
758        font_name: &str,
759        font_ref: (u32, u16),
760        document: &PdfDocument<R>,
761    ) {
762        // Check persistent object cache first — avoids re-parsing across pages
763        if let Some(cached) = self.font_object_cache.get(&font_ref) {
764            self.font_cache
765                .insert(font_name.to_string(), cached.clone());
766            tracing::debug!(
767                "Reused cached font object ({}, {}): {} (ToUnicode: {})",
768                font_ref.0,
769                font_ref.1,
770                font_name,
771                cached.to_unicode.is_some()
772            );
773            return;
774        }
775
776        // Parse font object
777        if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
778            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
779            if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
780                let has_to_unicode = font_info.to_unicode.is_some();
781                // Store in persistent cache
782                self.font_object_cache.insert(font_ref, font_info.clone());
783                // Store in per-page name cache
784                self.font_cache.insert(font_name.to_string(), font_info);
785                tracing::debug!(
786                    "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
787                    font_ref.0,
788                    font_ref.1,
789                    font_name,
790                    has_to_unicode
791                );
792            }
793        }
794    }
795
796    /// Decode text using the current font encoding and ToUnicode mapping
797    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
798        use crate::text::encoding::TextEncoding;
799
800        // First, try to use cached font information with ToUnicode CMap
801        if let Some(ref font_name) = state.font_name {
802            if let Some(font_info) = self.font_cache.get(font_name) {
803                // Try CMap-based decoding first (free function — no allocation)
804                if let Ok(decoded) =
805                    crate::text::extraction_cmap::decode_text_with_font(text, font_info)
806                {
807                    // Only accept if we got meaningful text (not all null bytes or garbage)
808                    if !decoded.trim().is_empty()
809                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
810                    {
811                        // Apply sanitization to remove control characters (Issue #116)
812                        let sanitized = sanitize_extracted_text(&decoded);
813                        tracing::debug!(
814                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
815                            font_name,
816                            text,
817                            sanitized
818                        );
819                        return Ok(sanitized);
820                    }
821                }
822
823                tracing::debug!(
824                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
825                    font_name
826                );
827            }
828        }
829
830        // Fall back to encoding-based decoding
831        let encoding = if let Some(ref font_name) = state.font_name {
832            match font_name.to_lowercase().as_str() {
833                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
834                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
835                name if name.contains("standard") => TextEncoding::StandardEncoding,
836                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
837                _ => {
838                    // Default based on common patterns
839                    if font_name.starts_with("Times")
840                        || font_name.starts_with("Helvetica")
841                        || font_name.starts_with("Courier")
842                    {
843                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
844                    } else {
845                        TextEncoding::PdfDocEncoding // Safe default
846                    }
847                }
848            }
849        } else {
850            TextEncoding::WinAnsiEncoding // Default for most PDFs
851        };
852
853        let fallback_result = encoding.decode(text);
854        // Apply sanitization to remove control characters (Issue #116)
855        let sanitized = sanitize_extracted_text(&fallback_result);
856        tracing::debug!(
857            "Fallback encoding decoding: {:?} -> \"{}\"",
858            text,
859            sanitized
860        );
861        Ok(sanitized)
862    }
863}
864
865impl Default for TextExtractor {
866    fn default() -> Self {
867        Self::new()
868    }
869}
870
/// Multiply two transformation matrices given in six-element `[a b c d e f]` form.
///
/// Each array stands for the 3x3 matrix whose rows are `[a b 0]`, `[c d 0]`
/// and `[e f 1]`. The result is the product `a x b`; with the row-vector
/// convention used by `transform_point`, that means `a` is applied first and
/// `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear 2x2 part
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation row: a's offset pushed through b, plus b's own offset
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
882
/// Transform a point using a transformation matrix in `[a b c d e f]` form.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
889
890/// Calculate text width using actual font metrics (including kerning)
891fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
892    // If we have font metrics, use them for accurate width calculation
893    if let Some(font) = font_info {
894        if let Some(ref widths) = font.metrics.widths {
895            let first_char = font.metrics.first_char.unwrap_or(0);
896            let last_char = font.metrics.last_char.unwrap_or(255);
897            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
898
899            let mut total_width = 0.0;
900            let mut chars = text.chars().peekable();
901
902            while let Some(ch) = chars.next() {
903                let char_code = ch as u32;
904
905                // Get width from Widths array or use missing_width
906                let width = if char_code >= first_char && char_code <= last_char {
907                    let index = (char_code - first_char) as usize;
908                    widths.get(index).copied().unwrap_or(missing_width)
909                } else {
910                    missing_width
911                };
912
913                // Convert from glyph space (1/1000 units) to user space
914                total_width += width / 1000.0 * font_size;
915
916                // Apply kerning if available (for character pairs)
917                if let Some(ref kerning) = font.metrics.kerning {
918                    if let Some(&next_ch) = chars.peek() {
919                        let next_char = next_ch as u32;
920                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
921                            // Kerning is in FUnits (1/1000), convert to user space
922                            total_width += kern_value / 1000.0 * font_size;
923                        }
924                    }
925                }
926            }
927
928            return total_width;
929        }
930    }
931
932    // Fallback to simplified calculation if no metrics available
933    text.len() as f64 * font_size * 0.5
934}
935
/// Sanitize extracted text by removing or replacing control characters.
///
/// Addresses Issue #116, where extracted text contained NUL bytes (`\0`) and
/// ETX characters (`\u{3}`) in places where spaces should appear.
///
/// # Behavior
///
/// - NUL (`\0`) is treated as a space; an ETX following it is dropped, so a
///   `\0\u{3}` pair yields a single space
/// - Runs of spaces (including spaces produced from NUL) collapse into one
/// - `\t`, `\n` and `\r` are preserved; a space directly after a tab is dropped
/// - All other ASCII control characters (0x01-0x1F and 0x7F) are removed
/// - Everything else is copied through unchanged
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX as word separator
/// let dirty = "a\0\u{3}sergeant\0\u{3}and";
/// assert_eq!(sanitize_extracted_text(dirty), "a sergeant and");
///
/// // Standalone NUL becomes space
/// let with_nul = "word\0another";
/// assert_eq!(sanitize_extracted_text(with_nul), "word another");
///
/// // Clean text passes through unchanged
/// let clean = "Normal text";
/// assert_eq!(sanitize_extracted_text(clean), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // Output never grows: every input char maps to at most one byte-equal
    // or shorter replacement.
    let mut cleaned = String::with_capacity(text.len());
    let mut prev_is_space = false;

    for ch in text.chars() {
        match ch {
            // NUL acts as a word separator; collapse with neighbouring spaces
            '\0' | ' ' => {
                if !prev_is_space {
                    cleaned.push(' ');
                    prev_is_space = true;
                }
            }

            // Allowed whitespace is kept; only a tab suppresses a following space
            '\t' | '\n' | '\r' => {
                cleaned.push(ch);
                prev_is_space = ch == '\t';
            }

            // Remaining control characters (ETX included) vanish without
            // touching the space-collapsing state
            c if c.is_ascii_control() => {}

            // Normal characters - keep them
            _ => {
                cleaned.push(ch);
                prev_is_space = false;
            }
        }
    }

    cleaned
}
1028
1029#[cfg(test)]
1030mod tests {
1031    use super::*;
1032
#[test]
fn test_matrix_multiplication() {
    let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
    let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];

    // Identity is neutral on either side.
    let result = multiply_matrix(&identity, &translation);
    assert_eq!(result, translation);

    let result2 = multiply_matrix(&translation, &identity);
    assert_eq!(result2, translation);

    // Identity products cannot reveal swapped operands, so also check a
    // non-commutative pair. Scale first, then translate: offset is untouched.
    assert_eq!(
        multiply_matrix(&scale, &translation),
        [2.0, 0.0, 0.0, 3.0, 10.0, 20.0]
    );
    // Translate first, then scale: the offset is scaled as well.
    assert_eq!(
        multiply_matrix(&translation, &scale),
        [2.0, 0.0, 0.0, 3.0, 20.0, 60.0]
    );
}
1044
#[test]
fn test_transform_point() {
    // A pure translation by (10, 20) just shifts the point.
    let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
    let moved = transform_point(5.0, 5.0, &shift);
    assert_eq!(moved.0, 15.0);
    assert_eq!(moved.1, 25.0);
}
1052
#[test]
fn test_extraction_options_default() {
    // Every field of the default options is pinned to its documented value.
    let options = ExtractionOptions::default();
    assert!(!options.preserve_layout);
    assert_eq!(options.space_threshold, 0.3);
    assert_eq!(options.newline_threshold, 10.0);
    assert!(options.sort_by_position);
    assert!(!options.detect_columns);
    assert_eq!(options.column_threshold, 50.0);
    assert!(options.merge_hyphenated);
    // Space-decision tracking is opt-in and must stay off by default.
    assert!(!options.track_space_decisions);
}
1064
#[test]
fn test_extraction_options_custom() {
    // Every field is set to a non-default value so that a mix-up between
    // fields cannot go unnoticed.
    let options = ExtractionOptions {
        preserve_layout: true,
        space_threshold: 0.5,
        newline_threshold: 15.0,
        sort_by_position: false,
        detect_columns: true,
        column_threshold: 75.0,
        merge_hyphenated: false,
        track_space_decisions: true,
    };
    assert!(options.preserve_layout);
    assert_eq!(options.space_threshold, 0.5);
    assert_eq!(options.newline_threshold, 15.0);
    assert!(!options.sort_by_position);
    assert!(options.detect_columns);
    assert_eq!(options.column_threshold, 75.0);
    assert!(!options.merge_hyphenated);
    assert!(options.track_space_decisions);
}
1085
#[test]
fn test_parse_font_style_bold() {
    let bold_names = [
        // PostScript style
        "Helvetica-Bold",
        "TimesNewRoman-Bold",
        // TrueType style
        "Arial Bold",
        "Calibri Bold",
        // Short form
        "Helvetica-B",
    ];

    for &name in bold_names.iter() {
        assert_eq!(parse_font_style(name), (true, false), "font name: {:?}", name);
    }
}
1099
#[test]
fn test_parse_font_style_italic() {
    let italic_names = [
        // PostScript style
        "Helvetica-Italic",
        "Times-Oblique",
        // TrueType style
        "Arial Italic",
        "Courier Oblique",
        // Short form
        "Helvetica-I",
    ];

    for &name in italic_names.iter() {
        assert_eq!(parse_font_style(name), (false, true), "font name: {:?}", name);
    }
}
1113
#[test]
fn test_parse_font_style_bold_italic() {
    // Names carrying both a bold and an italic/oblique marker.
    let bold_italic_names = ["Helvetica-BoldItalic", "Times-BoldOblique", "Arial Bold Italic"];

    for &name in bold_italic_names.iter() {
        assert_eq!(parse_font_style(name), (true, true), "font name: {:?}", name);
    }
}
1120
#[test]
fn test_parse_font_style_regular() {
    // Plain family names carry neither style flag.
    let regular_names = ["Helvetica", "Times-Roman", "Courier", "Arial"];

    for &name in regular_names.iter() {
        assert_eq!(parse_font_style(name), (false, false), "font name: {:?}", name);
    }
}
1128
#[test]
fn test_parse_font_style_edge_cases() {
    let cases = [
        // Empty and unusual cases
        ("", (false, false)),
        ("UnknownFont", (false, false)),
        // Case insensitive
        ("HELVETICA-BOLD", (true, false)),
        ("times-ITALIC", (false, true)),
    ];

    for &(name, expected) in cases.iter() {
        assert_eq!(parse_font_style(name), expected, "font name: {:?}", name);
    }
}
1139
#[test]
fn test_text_fragment() {
    // A fragment is plain data: whatever goes in must come back out.
    let hello = TextFragment {
        text: String::from("Hello"),
        x: 100.0,
        y: 200.0,
        width: 50.0,
        height: 12.0,
        font_size: 10.0,
        font_name: None,
        is_bold: false,
        is_italic: false,
        color: None,
        space_decisions: Vec::new(),
    };

    assert_eq!(hello.text, "Hello");
    // Position
    assert_eq!(hello.x, 100.0);
    assert_eq!(hello.y, 200.0);
    // Extent
    assert_eq!(hello.width, 50.0);
    assert_eq!(hello.height, 12.0);
    assert_eq!(hello.font_size, 10.0);
}
1162
#[test]
fn test_extracted_text() {
    // Both fragments share every attribute except their text and x position.
    let fragment_at = |text: &str, x| TextFragment {
        text: text.to_string(),
        x,
        y: 200.0,
        width: 50.0,
        height: 12.0,
        font_size: 10.0,
        font_name: None,
        is_bold: false,
        is_italic: false,
        color: None,
        space_decisions: Vec::new(),
    };

    let extracted = ExtractedText {
        text: "Hello World".to_string(),
        fragments: vec![fragment_at("Hello", 100.0), fragment_at("World", 160.0)],
    };

    assert_eq!(extracted.text, "Hello World");
    assert_eq!(extracted.fragments.len(), 2);
    assert_eq!(extracted.fragments[0].text, "Hello");
    assert_eq!(extracted.fragments[1].text, "World");
}
1204
#[test]
fn test_text_state_default() {
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    let state = TextState::default();

    // All three matrices start out as the identity.
    assert_eq!(state.text_matrix, IDENTITY);
    assert_eq!(state.text_line_matrix, IDENTITY);
    assert_eq!(state.ctm, IDENTITY);

    // Spacing parameters start at zero, horizontal scaling at 100.
    assert_eq!(state.leading, 0.0);
    assert_eq!(state.char_space, 0.0);
    assert_eq!(state.word_space, 0.0);
    assert_eq!(state.horizontal_scale, 100.0);
    assert_eq!(state.text_rise, 0.0);

    // No font is selected yet.
    assert_eq!(state.font_size, 0.0);
    assert!(state.font_name.is_none());
    assert_eq!(state.render_mode, 0);
}
1220
#[test]
fn test_matrix_operations() {
    // A 90 degree rotation maps the x unit vector onto the y unit vector.
    let quarter_turn = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
    assert_eq!(transform_point(1.0, 0.0, &quarter_turn), (0.0, 1.0));

    // Independent scaling of each axis.
    let stretch = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
    assert_eq!(transform_point(5.0, 5.0, &stretch), (10.0, 15.0));

    // Scale, shear and translation combined:
    //   x' = 2*1 + 1*1 + 10 = 13
    //   y' = 1*1 + 2*1 + 20 = 23
    let combined = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
    assert_eq!(transform_point(1.0, 1.0, &combined), (13.0, 23.0));
}
1241
#[test]
fn test_text_extractor_new() {
    // A freshly built extractor carries the default option values.
    let extractor = TextExtractor::new();
    let opts = &extractor.options;

    assert!(!opts.preserve_layout);
    assert!(opts.sort_by_position);
    assert!(!opts.detect_columns);
    assert!(opts.merge_hyphenated);

    assert_eq!(opts.space_threshold, 0.3);
    assert_eq!(opts.newline_threshold, 10.0);
    assert_eq!(opts.column_threshold, 50.0);
}
1254
#[test]
fn test_text_extractor_with_options() {
    let requested = ExtractionOptions {
        preserve_layout: true,
        space_threshold: 0.3,
        newline_threshold: 12.0,
        sort_by_position: false,
        detect_columns: true,
        column_threshold: 60.0,
        merge_hyphenated: false,
        track_space_decisions: false,
    };

    // The extractor must store exactly the options it was given.
    let extractor = TextExtractor::with_options(requested.clone());
    let stored = &extractor.options;

    assert_eq!(stored.preserve_layout, requested.preserve_layout);
    assert_eq!(stored.space_threshold, requested.space_threshold);
    assert_eq!(stored.newline_threshold, requested.newline_threshold);
    assert_eq!(stored.sort_by_position, requested.sort_by_position);
    assert_eq!(stored.detect_columns, requested.detect_columns);
    assert_eq!(stored.column_threshold, requested.column_threshold);
    assert_eq!(stored.merge_hyphenated, requested.merge_hyphenated);
}
1279
1280    // =========================================================================
1281    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1282    // =========================================================================
1283
#[test]
fn test_calculate_text_width_with_no_font_info() {
    // No font at all: the estimate is 5 chars * 12.0 * 0.5 = 30.0
    let estimated = calculate_text_width("Hello", 12.0, None);

    assert_eq!(
        estimated, 30.0,
        "Without font info, should use simplified calculation: len * font_size * 0.5"
    );
}
1295
#[test]
fn test_calculate_text_width_with_empty_metrics() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Metrics that carry a missing-width value but no widths array at all.
    let metrics = FontMetrics {
        first_char: None,
        last_char: None,
        widths: None,
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // Same result as having no font: 5 chars * 12.0 * 0.5 = 30.0
    assert_eq!(
        calculate_text_width("Hello", 12.0, Some(&font_info)),
        30.0,
        "Without widths array, should fall back to simplified calculation"
    );
}
1327
#[test]
fn test_calculate_text_width_with_complete_metrics() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Widths table for codes 32..=126 (95 entries), in 1/1000 units.
    // Only the glyphs of "Hello" get Helvetica-like widths; the rest stay 0.
    let mut widths = vec![0.0; 95];
    for &(code, glyph_width) in [(b'H', 722.0), (b'e', 556.0), (b'l', 278.0), (b'o', 611.0)].iter()
    {
        widths[usize::from(code - 32)] = glyph_width;
    }

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(widths),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    let width = calculate_text_width("Hello", 12.0, Some(&font_info));

    // Per-glyph contribution is width / 1000 * font_size:
    //   H 8.664 + e 6.672 + l 3.336 + l 3.336 + o 7.332 = 29.34
    let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
    let diff = (width - expected).abs();
    let tolerance = 0.0001; // Floating point tolerance
    assert!(
        diff < tolerance,
        "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
        expected,
        width,
        diff
    );

    // The metrics path must not coincide with the 0.5-per-char estimate.
    let simplified = 5.0 * 12.0 * 0.5; // 30.0
    assert_ne!(
        width, simplified,
        "Metrics-based calculation should differ from simplified (30.0)"
    );
}
1386
#[test]
fn test_calculate_text_width_character_outside_range() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // The widths table only spans 'A' (65) through 'Z' (90), all equally wide.
    let metrics = FontMetrics {
        first_char: Some(65),
        last_char: Some(90),
        widths: Some(vec![722.0; 26]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // 'A' is covered by the table; '1' (49) lies below its first code.
    let width = calculate_text_width("A1", 10.0, Some(&font_info));

    //   'A': 722/1000 * 10 = 7.22
    //   '1': missing_width 500/1000 * 10 = 5.0
    //   sum: 12.22
    let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
    assert_eq!(
        width, expected,
        "Should use missing_width for characters outside range"
    );
}
1425
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Every glyph is 500 wide except the one at index 10, which is 0.0.
    let mut widths = vec![500.0; 95];
    widths[10] = 0.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(widths),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // '*' is code 42, i.e. index 10 counted from first_char 32.
    let text = "*";
    let width = calculate_text_width(text, 10.0, Some(&font_info));

    // An in-range entry of 0.0 is a legitimate zero-width glyph and must be
    // used as is rather than replaced by missing_width.
    assert_eq!(
        width, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
1464
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // With metrics: nothing to sum up.
    let with_font = calculate_text_width("", 12.0, Some(&font_info));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Without metrics: the estimate multiplies by a length of zero.
    let width_no_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        width_no_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1497
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // The widths table covers printable ASCII only (32..=126).
    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // 'Ñ' is U+00D1 (209), beyond last_char, so missing_width applies:
    // 600/1000 * 10 = 6.0
    let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
    assert_eq!(
        width, 6.0,
        "Unicode character outside range should use missing_width"
    );
}
1531
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // A one-entry widths table that only knows 'A' (65).
    let metrics = FontMetrics {
        first_char: Some(65),
        last_char: Some(65),
        widths: Some(vec![722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // Measure the same glyph at two font sizes.
    let font = Some(&font_info);
    let width_10 = calculate_text_width("A", 10.0, font);
    let width_20 = calculate_text_width("A", 20.0, font);

    assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
    assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
    // Doubling the font size doubles the width.
    assert_eq!(
        width_20,
        width_10 * 2.0,
        "Width should scale linearly with font size"
    );
}
1567
1568    #[test]
1569    fn test_calculate_text_width_proportional_vs_monospace() {
1570        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1571
1572        // Simulate proportional font (different widths)
1573        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1574        let proportional_font = FontInfo {
1575            name: "Helvetica".to_string(),
1576            font_type: "Type1".to_string(),
1577            encoding: None,
1578            to_unicode: None,
1579            differences: None,
1580            descendant_font: None,
1581            cid_to_gid_map: None,
1582            cid_ordering: None,
1583            metrics: FontMetrics {
1584                first_char: Some(105), // 'i'
1585                last_char: Some(107),  // covers i, j, k
1586                widths: Some(proportional_widths),
1587                missing_width: Some(500.0),
1588                kerning: None,
1589            },
1590        };
1591
1592        // Simulate monospace font (same width)
1593        let monospace_widths = vec![600.0, 600.0, 600.0];
1594        let monospace_font = FontInfo {
1595            name: "Courier".to_string(),
1596            font_type: "Type1".to_string(),
1597            encoding: None,
1598            to_unicode: None,
1599            differences: None,
1600            descendant_font: None,
1601            cid_to_gid_map: None,
1602            cid_ordering: None,
1603            metrics: FontMetrics {
1604                first_char: Some(105),
1605                last_char: Some(107),
1606                widths: Some(monospace_widths),
1607                missing_width: Some(600.0),
1608                kerning: None,
1609            },
1610        };
1611
1612        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1613        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1614
1615        // Proportional 'i' should be narrower than monospace 'i'
1616        assert!(
1617            prop_width < mono_width,
1618            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1619            prop_width,
1620            mono_width
1621        );
1622    }
1623
1624    // =========================================================================
1625    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1626    // =========================================================================
1627
1628    #[test]
1629    fn test_calculate_text_width_with_kerning() {
1630        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1631        use std::collections::HashMap;
1632
1633        // Create a font with kerning pairs
1634        let mut widths = vec![500.0; 95]; // ASCII 32-126
1635        widths[65 - 32] = 722.0; // 'A'
1636        widths[86 - 32] = 722.0; // 'V'
1637        widths[87 - 32] = 944.0; // 'W'
1638
1639        let mut kerning = HashMap::new();
1640        // Typical kerning pairs (in FUnits, 1/1000)
1641        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1642        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1643
1644        let font_info = FontInfo {
1645            name: "Helvetica".to_string(),
1646            font_type: "Type1".to_string(),
1647            encoding: None,
1648            to_unicode: None,
1649            differences: None,
1650            descendant_font: None,
1651            cid_to_gid_map: None,
1652            cid_ordering: None,
1653            metrics: FontMetrics {
1654                first_char: Some(32),
1655                last_char: Some(126),
1656                widths: Some(widths),
1657                missing_width: Some(500.0),
1658                kerning: Some(kerning),
1659            },
1660        };
1661
1662        // Test "AV" with kerning
1663        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1664        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1665        //         = 17.328 - 0.6 = 16.728
1666        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1667        let tolerance = 0.0001;
1668        assert!(
1669            (width_av - expected_av).abs() < tolerance,
1670            "AV with kerning: expected {}, got {}, diff {}",
1671            expected_av,
1672            width_av,
1673            (width_av - expected_av).abs()
1674        );
1675
1676        // Test "AW" with different kerning value
1677        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1678        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1679        //         = 19.992 - 0.48 = 19.512
1680        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1681        assert!(
1682            (width_aw - expected_aw).abs() < tolerance,
1683            "AW with kerning: expected {}, got {}, diff {}",
1684            expected_aw,
1685            width_aw,
1686            (width_aw - expected_aw).abs()
1687        );
1688
1689        // Test "VA" with NO kerning (pair not in HashMap)
1690        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1691        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1692        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1693        assert!(
1694            (width_va - expected_va).abs() < tolerance,
1695            "VA without kerning: expected {}, got {}, diff {}",
1696            expected_va,
1697            width_va,
1698            (width_va - expected_va).abs()
1699        );
1700
1701        // Verify kerning makes a measurable difference
1702        assert!(
1703            width_av < width_va,
1704            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1705            width_av,
1706            width_va
1707        );
1708    }
1709
1710    #[test]
1711    fn test_parse_truetype_kern_table_minimal() {
1712        use crate::text::extraction_cmap::parse_truetype_kern_table;
1713
1714        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1715        // Structure:
1716        // 1. Offset table (12 bytes)
1717        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1718        // 3. 'head' table data (54 bytes)
1719        // 4. 'kern' table data (30 bytes)
1720        // Total: 128 bytes
1721        let mut ttf_data = vec![
1722            // Offset table
1723            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1724            0x00, 0x02, // numTables: 2
1725            0x00, 0x20, // searchRange: 32
1726            0x00, 0x01, // entrySelector: 1
1727            0x00, 0x00, // rangeShift: 0
1728        ];
1729
1730        // Table directory entry 1: 'head' table
1731        ttf_data.extend_from_slice(b"head"); // tag
1732        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1733        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1734        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1735
1736        // Table directory entry 2: 'kern' table
1737        ttf_data.extend_from_slice(b"kern"); // tag
1738        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1739        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1740        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1741
1742        // 'head' table data (54 bytes of zeros - minimal valid head table)
1743        ttf_data.extend_from_slice(&[0u8; 54]);
1744
1745        // 'kern' table data (34 bytes)
1746        ttf_data.extend_from_slice(&[
1747            // Kern table header
1748            0x00, 0x00, // version: 0
1749            0x00, 0x01, // nTables: 1
1750            // Subtable header
1751            0x00, 0x00, // version: 0
1752            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1753            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1754            0x00, 0x02, // nPairs: 2
1755            0x00, 0x08, // searchRange: 8
1756            0x00, 0x00, // entrySelector: 0
1757            0x00, 0x04, // rangeShift: 4
1758            // Kerning pair 1: A + V → -50
1759            0x00, 0x41, // left glyph: 65 ('A')
1760            0x00, 0x56, // right glyph: 86 ('V')
1761            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1762            // Kerning pair 2: A + W → -40
1763            0x00, 0x41, // left glyph: 65 ('A')
1764            0x00, 0x57, // right glyph: 87 ('W')
1765            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1766        ]);
1767
1768        let result = parse_truetype_kern_table(&ttf_data);
1769        assert!(
1770            result.is_ok(),
1771            "Should parse minimal kern table successfully: {:?}",
1772            result.err()
1773        );
1774
1775        let kerning_map = result.unwrap();
1776        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1777
1778        // Verify pair 1: A + V → -50
1779        assert_eq!(
1780            kerning_map.get(&(65, 86)),
1781            Some(&-50.0),
1782            "Should have A+V kerning pair with value -50"
1783        );
1784
1785        // Verify pair 2: A + W → -40
1786        assert_eq!(
1787            kerning_map.get(&(65, 87)),
1788            Some(&-40.0),
1789            "Should have A+W kerning pair with value -40"
1790        );
1791    }
1792
1793    #[test]
1794    fn test_parse_kern_table_no_kern_table() {
1795        use crate::text::extraction_cmap::extract_truetype_kerning;
1796
1797        // TrueType font data WITHOUT a 'kern' table
1798        // Structure:
1799        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1800        // - Table directory: 1 entry for 'head' table (not 'kern')
1801        let ttf_data = vec![
1802            // Offset table
1803            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1804            0x00, 0x01, // numTables: 1
1805            0x00, 0x10, // searchRange: 16
1806            0x00, 0x00, // entrySelector: 0
1807            0x00, 0x00, // rangeShift: 0
1808            // Table directory entry: 'head' table (not 'kern')
1809            b'h', b'e', b'a', b'd', // tag: 'head'
1810            0x00, 0x00, 0x00, 0x00, // checksum
1811            0x00, 0x00, 0x00, 0x1C, // offset: 28
1812            0x00, 0x00, 0x00, 0x36, // length: 54
1813            // Mock 'head' table data (54 bytes of zeros)
1814            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1815            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1816            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1817            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1818        ];
1819
1820        let result = extract_truetype_kerning(&ttf_data);
1821        assert!(
1822            result.is_ok(),
1823            "Should gracefully handle missing kern table"
1824        );
1825
1826        let kerning_map = result.unwrap();
1827        assert!(
1828            kerning_map.is_empty(),
1829            "Should return empty HashMap when no kern table exists"
1830        );
1831    }
1832}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs