oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Options controlling how text is extracted from a page.
///
/// The [`Default`] implementation yields plain-text extraction: no layout
/// preservation, position sorting enabled, column detection disabled and
/// hyphen merging enabled.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Preserve the original layout (spacing and positioning).
    ///
    /// When enabled, positioned [`TextFragment`]s are collected during
    /// extraction and the page text is rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the current font size, above which a
    /// space character is inserted between two pieces of text on the same line.
    pub space_threshold: f64,
    /// Minimum vertical distance between two pieces of text for them to be
    /// treated as separate lines (a newline is inserted between them).
    ///
    /// Also used as the height of the Y bands when fragments are sorted.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts).
    ///
    /// Only affects collected fragments, so it has no effect when no
    /// fragments were recorded.
    pub sort_by_position: bool,
    /// Detect columns and order fragments column by column.
    ///
    /// Evaluated as part of position sorting, i.e. only when
    /// `sort_by_position` is also enabled.
    pub detect_columns: bool,
    /// Minimum horizontal gap between two neighbouring fragments on a line
    /// for the gap to be treated as a column separation (in page units).
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends: the trailing `-` is removed and
    /// no newline is emitted when the text is rebuilt from fragments.
    pub merge_hyphenated: bool,
}
34
35impl Default for ExtractionOptions {
36    fn default() -> Self {
37        Self {
38            preserve_layout: false,
39            space_threshold: 0.3,
40            newline_threshold: 10.0,
41            sort_by_position: true,
42            detect_columns: false,
43            column_threshold: 50.0,
44            merge_hyphenated: true,
45        }
46    }
47}
48
/// Text extracted from a single page, optionally with position information.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text content.
    pub text: String,
    /// Text fragments with position information.
    ///
    /// Fragments are only recorded when `preserve_layout` is enabled in the
    /// extraction options; otherwise this vector is empty.
    pub fragments: Vec<TextFragment>,
}
57
/// A fragment of text with position and styling information.
///
/// One fragment is recorded per shown string; fragments that sit very close
/// together on the same line may later be merged into a single fragment.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Text content (already decoded).
    pub text: String,
    /// X position in page coordinates (text origin after applying the text
    /// matrix and the current transformation matrix).
    pub x: f64,
    /// Y position in page coordinates (text origin after applying the text
    /// matrix and the current transformation matrix).
    pub y: f64,
    /// Width of the text, computed from the decoded string, the font size
    /// and the cached font information when available.
    pub width: f64,
    /// Height of the text; set to the font size in effect when it was shown.
    pub height: f64,
    /// Font size in effect when the text was shown.
    pub font_size: f64,
    /// Font name in effect when the text was shown, as given to the
    /// set-font operator (`None` if no font had been selected).
    pub font_name: Option<String>,
    /// Whether the font is bold (detected from the font name).
    pub is_bold: bool,
    /// Whether the font is italic (detected from the font name).
    pub is_italic: bool,
    /// Fill color of the text (from the graphics state), if one was set.
    pub color: Option<Color>,
}
82
/// Text extraction state tracked while walking a page's content operations.
///
/// Matrices are stored as the six PDF matrix components `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix; reset to identity at the start of each text
    /// object and advanced after every shown string.
    text_matrix: [f64; 6],
    /// Current text line matrix; the base that text-positioning operators
    /// translate from.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM).
    ctm: [f64; 6],
    /// Text leading (line spacing), used when moving to the next line.
    leading: f64,
    /// Character spacing.
    // NOTE(review): stored from the operator but not read by the extraction
    // code visible in this part of the file.
    char_space: f64,
    /// Word spacing.
    // NOTE(review): stored but not read in the visible code.
    word_space: f64,
    /// Horizontal scaling as a percentage (100.0 = unscaled).
    horizontal_scale: f64,
    /// Text rise.
    // NOTE(review): stored but not read in the visible code.
    text_rise: f64,
    /// Current font size.
    font_size: f64,
    /// Current font name; also the lookup key into the extractor's font cache.
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.).
    // NOTE(review): stored but not read in the visible code.
    render_mode: u8,
    /// Fill color (for text rendering); `None` until a non-stroking color
    /// operator is seen.
    fill_color: Option<Color>,
}
110
111impl Default for TextState {
112    fn default() -> Self {
113        Self {
114            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117            leading: 0.0,
118            char_space: 0.0,
119            word_space: 0.0,
120            horizontal_scale: 100.0,
121            text_rise: 0.0,
122            font_size: 0.0,
123            font_name: None,
124            render_mode: 0,
125            fill_color: None,
126        }
127    }
128}
129
/// Guess the bold/italic style of a font from its name.
///
/// The name is lower-cased and searched for common style markers, so both
/// PostScript-style names (e.g., "Helvetica-Bold", "Times-BoldItalic") and
/// space-separated names (e.g., "Arial Bold", "Courier Oblique") are
/// recognised. Matching is purely substring based: besides the full words
/// `bold`, `italic` and `oblique`, the abbreviations `-b` / `-i` anywhere in
/// the name and a standalone `b` / `i` word also count.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Substrings that mark a style when found anywhere in the name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    // A style applies if any marker occurs in the name, or the name ends
    // with the standalone one-letter abbreviation.
    let has_style = |markers: &[&str], trailing_abbreviation: &str| {
        markers.iter().any(|marker| lowered.contains(*marker))
            || lowered.ends_with(trailing_abbreviation)
    };

    (
        has_style(&BOLD_MARKERS, " b"),
        has_style(&ITALIC_MARKERS, " i"),
    )
}
168
/// Text extractor for PDF pages with CMap support.
///
/// Holds the extraction options together with a cache of the fonts
/// referenced by the page currently being processed.
pub struct TextExtractor {
    /// Options applied to every extraction performed by this extractor.
    options: ExtractionOptions,
    /// Font cache for the current extraction, keyed by the font's name in
    /// the page's font resources. Cleared and rebuilt for every page.
    font_cache: HashMap<String, FontInfo>,
}
175
176impl TextExtractor {
177    /// Create a new text extractor with default options
178    pub fn new() -> Self {
179        Self {
180            options: ExtractionOptions::default(),
181            font_cache: HashMap::new(),
182        }
183    }
184
185    /// Create a text extractor with custom options
186    pub fn with_options(options: ExtractionOptions) -> Self {
187        Self {
188            options,
189            font_cache: HashMap::new(),
190        }
191    }
192
193    /// Extract text from a PDF document
194    pub fn extract_from_document<R: Read + Seek>(
195        &mut self,
196        document: &PdfDocument<R>,
197    ) -> ParseResult<Vec<ExtractedText>> {
198        let page_count = document.page_count()?;
199        let mut results = Vec::new();
200
201        for i in 0..page_count {
202            let text = self.extract_from_page(document, i)?;
203            results.push(text);
204        }
205
206        Ok(results)
207    }
208
209    /// Extract text from a specific page
210    pub fn extract_from_page<R: Read + Seek>(
211        &mut self,
212        document: &PdfDocument<R>,
213        page_index: u32,
214    ) -> ParseResult<ExtractedText> {
215        // Get the page
216        let page = document.get_page(page_index)?;
217
218        // Extract font resources first
219        self.extract_font_resources(&page, document)?;
220
221        // Get content streams
222        let streams = page.content_streams_with_document(document)?;
223
224        let mut extracted_text = String::new();
225        let mut fragments = Vec::new();
226        let mut state = TextState::default();
227        let mut in_text_object = false;
228        let mut last_x = 0.0;
229        let mut last_y = 0.0;
230
231        // Process each content stream
232        for (stream_idx, stream_data) in streams.iter().enumerate() {
233            let operations = match ContentParser::parse_content(stream_data) {
234                Ok(ops) => ops,
235                Err(e) => {
236                    // Enhanced diagnostic logging for content stream parsing failures
237                    tracing::debug!(
238                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
239                        page_index + 1,
240                        stream_idx + 1,
241                        streams.len()
242                    );
243                    tracing::debug!("         Error: {}", e);
244                    tracing::debug!("         Stream size: {} bytes", stream_data.len());
245
246                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
247                    let preview_len = stream_data.len().min(100);
248                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249                    tracing::debug!(
250                        "         Stream preview (first {} bytes): {:?}",
251                        preview_len,
252                        preview.chars().take(80).collect::<String>()
253                    );
254
255                    // Continue processing other streams
256                    continue;
257                }
258            };
259
260            for op in operations {
261                match op {
262                    ContentOperation::BeginText => {
263                        in_text_object = true;
264                        // Reset text matrix to identity
265                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267                    }
268
269                    ContentOperation::EndText => {
270                        in_text_object = false;
271                    }
272
273                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274                        state.text_matrix =
275                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276                        state.text_line_matrix =
277                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278                    }
279
280                    ContentOperation::MoveText(tx, ty) => {
281                        // Update text matrix by translation
282                        let new_matrix = multiply_matrix(
283                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284                            &state.text_line_matrix,
285                        );
286                        state.text_matrix = new_matrix;
287                        state.text_line_matrix = new_matrix;
288                    }
289
290                    ContentOperation::NextLine => {
291                        // Move to next line using current leading
292                        let new_matrix = multiply_matrix(
293                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294                            &state.text_line_matrix,
295                        );
296                        state.text_matrix = new_matrix;
297                        state.text_line_matrix = new_matrix;
298                    }
299
300                    ContentOperation::ShowText(text) => {
301                        if in_text_object {
302                            let text_bytes = &text;
303                            let decoded = self.decode_text(text_bytes, &state)?;
304
305                            // Calculate position: Apply text_matrix, then CTM
306                            // Concatenate: final = CTM × text_matrix
307                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310                            // Add spacing based on position change
311                            if !extracted_text.is_empty() {
312                                let dx = x - last_x;
313                                let dy = (y - last_y).abs();
314
315                                if dy > self.options.newline_threshold {
316                                    extracted_text.push('\n');
317                                } else if dx > self.options.space_threshold * state.font_size {
318                                    extracted_text.push(' ');
319                                }
320                            }
321
322                            extracted_text.push_str(&decoded);
323
324                            // Get font info for accurate width calculation
325                            let font_info = state
326                                .font_name
327                                .as_ref()
328                                .and_then(|name| self.font_cache.get(name));
329
330                            if self.options.preserve_layout {
331                                // Detect bold/italic from font name
332                                let (is_bold, is_italic) = state
333                                    .font_name
334                                    .as_ref()
335                                    .map(|name| parse_font_style(name))
336                                    .unwrap_or((false, false));
337
338                                fragments.push(TextFragment {
339                                    text: decoded.clone(),
340                                    x,
341                                    y,
342                                    width: calculate_text_width(
343                                        &decoded,
344                                        state.font_size,
345                                        font_info,
346                                    ),
347                                    height: state.font_size,
348                                    font_size: state.font_size,
349                                    font_name: state.font_name.clone(),
350                                    is_bold,
351                                    is_italic,
352                                    color: state.fill_color,
353                                });
354                            }
355
356                            // Update position for next text
357                            last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358                            last_y = y;
359
360                            // Update text matrix for next show operation
361                            let text_width =
362                                calculate_text_width(&decoded, state.font_size, font_info);
363                            let tx = text_width * state.horizontal_scale / 100.0;
364                            state.text_matrix =
365                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366                        }
367                    }
368
369                    ContentOperation::ShowTextArray(array) => {
370                        if in_text_object {
371                            // Get font info for accurate width calculation
372                            let font_info = state
373                                .font_name
374                                .as_ref()
375                                .and_then(|name| self.font_cache.get(name));
376
377                            for item in array {
378                                match item {
379                                    TextElement::Text(text_bytes) => {
380                                        let decoded = self.decode_text(&text_bytes, &state)?;
381                                        extracted_text.push_str(&decoded);
382
383                                        // Update text matrix
384                                        let text_width = calculate_text_width(
385                                            &decoded,
386                                            state.font_size,
387                                            font_info,
388                                        );
389                                        let tx = text_width * state.horizontal_scale / 100.0;
390                                        state.text_matrix = multiply_matrix(
391                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392                                            &state.text_matrix,
393                                        );
394                                    }
395                                    TextElement::Spacing(adjustment) => {
396                                        // Text position adjustment (negative = move left)
397                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398                                        state.text_matrix = multiply_matrix(
399                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400                                            &state.text_matrix,
401                                        );
402                                    }
403                                }
404                            }
405                        }
406                    }
407
408                    ContentOperation::SetFont(name, size) => {
409                        state.font_name = Some(name);
410                        state.font_size = size as f64;
411                    }
412
413                    ContentOperation::SetLeading(leading) => {
414                        state.leading = leading as f64;
415                    }
416
417                    ContentOperation::SetCharSpacing(spacing) => {
418                        state.char_space = spacing as f64;
419                    }
420
421                    ContentOperation::SetWordSpacing(spacing) => {
422                        state.word_space = spacing as f64;
423                    }
424
425                    ContentOperation::SetHorizontalScaling(scale) => {
426                        state.horizontal_scale = scale as f64;
427                    }
428
429                    ContentOperation::SetTextRise(rise) => {
430                        state.text_rise = rise as f64;
431                    }
432
433                    ContentOperation::SetTextRenderMode(mode) => {
434                        state.render_mode = mode as u8;
435                    }
436
437                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438                        // Update CTM: new_ctm = concat_matrix * current_ctm
439                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
440                        let a = a as f64;
441                        let b = b as f64;
442                        let c = c as f64;
443                        let d = d as f64;
444                        let e = e as f64;
445                        let f = f as f64;
446                        state.ctm = [
447                            a * a0 + b * c0,
448                            a * b0 + b * d0,
449                            c * a0 + d * c0,
450                            c * b0 + d * d0,
451                            e * a0 + f * c0 + e0,
452                            e * b0 + f * d0 + f0,
453                        ];
454                    }
455
456                    // Color operations (Phase 4: Color extraction)
457                    ContentOperation::SetNonStrokingGray(gray) => {
458                        state.fill_color = Some(Color::gray(gray as f64));
459                    }
460
461                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
462                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463                    }
464
465                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466                        state.fill_color =
467                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468                    }
469
470                    _ => {
471                        // Other operations don't affect text extraction
472                    }
473                }
474            }
475        }
476
477        // Sort and process fragments if requested
478        if self.options.sort_by_position && !fragments.is_empty() {
479            self.sort_and_merge_fragments(&mut fragments);
480        }
481
482        // Merge close fragments to eliminate spacing artifacts
483        // This is crucial for table detection and structured data extraction
484        if self.options.preserve_layout && !fragments.is_empty() {
485            fragments = self.merge_close_fragments(&fragments);
486        }
487
488        // Reconstruct text from sorted fragments if layout is preserved
489        if self.options.preserve_layout && !fragments.is_empty() {
490            extracted_text = self.reconstruct_text_from_fragments(&fragments);
491        }
492
493        Ok(ExtractedText {
494            text: extracted_text,
495            fragments,
496        })
497    }
498
499    /// Sort text fragments by position and merge them appropriately
500    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501        // Sort fragments by Y position (top to bottom) then X position (left to right).
502        //
503        // We quantize Y into bands of `newline_threshold` width so that fragments
504        // on the "same line" get identical Y keys. This ensures the comparator is
505        // a strict total order (transitive), which Rust's sort algorithm requires.
506        // Without quantization, threshold-based "same line" detection breaks
507        // transitivity: A≈B and B≈C does NOT imply A≈C.
508        let threshold = self.options.newline_threshold;
509        fragments.sort_by(|a, b| {
510            // Quantize Y to nearest band (PDF Y increases upward, so negate first)
511            let band_a = if threshold > 0.0 {
512                (-a.y / threshold).round()
513            } else {
514                -a.y
515            };
516            let band_b = if threshold > 0.0 {
517                (-b.y / threshold).round()
518            } else {
519                -b.y
520            };
521
522            // Compare by Y band (top to bottom), then by X within same band
523            band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
524        });
525
526        // Detect columns if requested
527        if self.options.detect_columns {
528            self.detect_and_sort_columns(fragments);
529        }
530    }
531
532    /// Detect columns and re-sort fragments accordingly
533    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
534        // Group fragments by approximate Y position
535        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
536        let mut current_line: Vec<&mut TextFragment> = Vec::new();
537        let mut last_y = f64::INFINITY;
538
539        for fragment in fragments.iter_mut() {
540            let fragment_y = fragment.y;
541            if (last_y - fragment_y).abs() > self.options.newline_threshold
542                && !current_line.is_empty()
543            {
544                lines.push(current_line);
545                current_line = Vec::new();
546            }
547            current_line.push(fragment);
548            last_y = fragment_y;
549        }
550        if !current_line.is_empty() {
551            lines.push(current_line);
552        }
553
554        // Detect column boundaries
555        let mut column_boundaries = vec![0.0];
556        for line in &lines {
557            if line.len() > 1 {
558                for i in 0..line.len() - 1 {
559                    let gap = line[i + 1].x - (line[i].x + line[i].width);
560                    if gap > self.options.column_threshold {
561                        let boundary = line[i].x + line[i].width + gap / 2.0;
562                        if !column_boundaries
563                            .iter()
564                            .any(|&b| (b - boundary).abs() < 10.0)
565                        {
566                            column_boundaries.push(boundary);
567                        }
568                    }
569                }
570            }
571        }
572        column_boundaries.sort_by(|a, b| a.total_cmp(b));
573
574        // Re-sort fragments by column then Y position
575        if column_boundaries.len() > 1 {
576            fragments.sort_by(|a, b| {
577                // Determine column for each fragment
578                let col_a = column_boundaries
579                    .iter()
580                    .position(|&boundary| a.x < boundary)
581                    .unwrap_or(column_boundaries.len())
582                    - 1;
583                let col_b = column_boundaries
584                    .iter()
585                    .position(|&boundary| b.x < boundary)
586                    .unwrap_or(column_boundaries.len())
587                    - 1;
588
589                if col_a != col_b {
590                    col_a.cmp(&col_b)
591                } else {
592                    // Same column, sort by Y position
593                    b.y.total_cmp(&a.y)
594                }
595            });
596        }
597    }
598
599    /// Reconstruct text from sorted fragments
600    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
601        // First, merge consecutive fragments that are very close together
602        let merged_fragments = self.merge_close_fragments(fragments);
603
604        let mut result = String::new();
605        let mut last_y = f64::INFINITY;
606        let mut last_x = 0.0;
607        let mut last_line_ended_with_hyphen = false;
608
609        for fragment in &merged_fragments {
610            // Check if we need a newline
611            let y_diff = (last_y - fragment.y).abs();
612            if !result.is_empty() && y_diff > self.options.newline_threshold {
613                // Handle hyphenation
614                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
615                    // Remove the hyphen and don't add newline
616                    if result.ends_with('-') {
617                        result.pop();
618                    }
619                } else {
620                    result.push('\n');
621                }
622            } else if !result.is_empty() {
623                // Check if we need a space
624                let x_gap = fragment.x - last_x;
625                if x_gap > self.options.space_threshold * fragment.font_size {
626                    result.push(' ');
627                }
628            }
629
630            result.push_str(&fragment.text);
631            last_line_ended_with_hyphen = fragment.text.ends_with('-');
632            last_y = fragment.y;
633            last_x = fragment.x + fragment.width;
634        }
635
636        result
637    }
638
639    /// Merge fragments that are very close together on the same line
640    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
641    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
642        if fragments.is_empty() {
643            return Vec::new();
644        }
645
646        let mut merged = Vec::new();
647        let mut current = fragments[0].clone();
648
649        for fragment in &fragments[1..] {
650            // Check if this fragment is on the same line and very close
651            let y_diff = (current.y - fragment.y).abs();
652            let x_gap = fragment.x - (current.x + current.width);
653
654            // Merge if on same line and gap is less than a character width
655            // Use 0.5 * font_size as threshold - this catches most artificial spacing
656            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
657                && x_gap >= 0.0  // Fragment is to the right
658                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
659
660            if should_merge {
661                // Merge this fragment into current
662                current.text.push_str(&fragment.text);
663                current.width = (fragment.x + fragment.width) - current.x;
664            } else {
665                // Start a new fragment
666                merged.push(current);
667                current = fragment.clone();
668            }
669        }
670
671        merged.push(current);
672        merged
673    }
674
675    /// Extract font resources from page
676    fn extract_font_resources<R: Read + Seek>(
677        &mut self,
678        page: &ParsedPage,
679        document: &PdfDocument<R>,
680    ) -> ParseResult<()> {
681        // Clear previous font cache
682        self.font_cache.clear();
683
684        // Try to get resources manually from page dictionary first
685        // This is necessary because ParsedPage.get_resources() may not always work
686        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
687            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
688            {
689                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
690                    // Extract each font
691                    for (font_name, font_obj) in font_dict.0.iter() {
692                        if let Some(font_ref) = font_obj.as_reference() {
693                            if let Ok(PdfObject::Dictionary(font_dict)) =
694                                document.get_object(font_ref.0, font_ref.1)
695                            {
696                                // Create a CMap extractor to use its font extraction logic
697                                let mut cmap_extractor: CMapTextExtractor<R> =
698                                    CMapTextExtractor::new();
699
700                                if let Ok(font_info) =
701                                    cmap_extractor.extract_font_info(&font_dict, document)
702                                {
703                                    let has_to_unicode = font_info.to_unicode.is_some();
704                                    self.font_cache.insert(font_name.0.clone(), font_info);
705                                    tracing::debug!(
706                                        "Cached font: {} (ToUnicode: {})",
707                                        font_name.0,
708                                        has_to_unicode
709                                    );
710                                }
711                            }
712                        }
713                    }
714                }
715            }
716        } else if let Some(resources) = page.get_resources() {
717            // Fallback to get_resources() if Resources is not a reference
718            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
719                for (font_name, font_obj) in font_dict.0.iter() {
720                    if let Some(font_ref) = font_obj.as_reference() {
721                        if let Ok(PdfObject::Dictionary(font_dict)) =
722                            document.get_object(font_ref.0, font_ref.1)
723                        {
724                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
725
726                            if let Ok(font_info) =
727                                cmap_extractor.extract_font_info(&font_dict, document)
728                            {
729                                let has_to_unicode = font_info.to_unicode.is_some();
730                                self.font_cache.insert(font_name.0.clone(), font_info);
731                                tracing::debug!(
732                                    "Cached font: {} (ToUnicode: {})",
733                                    font_name.0,
734                                    has_to_unicode
735                                );
736                            }
737                        }
738                    }
739                }
740            }
741        }
742
743        Ok(())
744    }
745
746    /// Decode text using the current font encoding and ToUnicode mapping
747    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
748        use crate::text::encoding::TextEncoding;
749
750        // First, try to use cached font information with ToUnicode CMap
751        if let Some(ref font_name) = state.font_name {
752            if let Some(font_info) = self.font_cache.get(font_name) {
753                // Create a temporary CMapTextExtractor to use its decoding logic
754                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
755
756                // Try CMap-based decoding first
757                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
758                    // Only accept if we got meaningful text (not all null bytes or garbage)
759                    if !decoded.trim().is_empty()
760                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
761                    {
762                        // Apply sanitization to remove control characters (Issue #116)
763                        let sanitized = sanitize_extracted_text(&decoded);
764                        tracing::debug!(
765                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
766                            font_name,
767                            text,
768                            sanitized
769                        );
770                        return Ok(sanitized);
771                    }
772                }
773
774                tracing::debug!(
775                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
776                    font_name
777                );
778            }
779        }
780
781        // Fall back to encoding-based decoding
782        let encoding = if let Some(ref font_name) = state.font_name {
783            match font_name.to_lowercase().as_str() {
784                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
785                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
786                name if name.contains("standard") => TextEncoding::StandardEncoding,
787                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
788                _ => {
789                    // Default based on common patterns
790                    if font_name.starts_with("Times")
791                        || font_name.starts_with("Helvetica")
792                        || font_name.starts_with("Courier")
793                    {
794                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
795                    } else {
796                        TextEncoding::PdfDocEncoding // Safe default
797                    }
798                }
799            }
800        } else {
801            TextEncoding::WinAnsiEncoding // Default for most PDFs
802        };
803
804        let fallback_result = encoding.decode(text);
805        // Apply sanitization to remove control characters (Issue #116)
806        let sanitized = sanitize_extracted_text(&fallback_result);
807        tracing::debug!(
808            "Fallback encoding decoding: {:?} -> \"{}\"",
809            text,
810            sanitized
811        );
812        Ok(sanitized)
813    }
814}
815
816impl Default for TextExtractor {
817    fn default() -> Self {
818        Self::new()
819    }
820}
821
/// Multiply two affine transformation matrices, returning `a × b`.
///
/// Each argument holds the six free entries of a 3×3 matrix whose third column is
/// fixed at `(0, 0, 1)`; indices 4 and 5 are the translation components. With the
/// row-vector convention used by `transform_point`, the product applies `a` first
/// and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part (upper-left 2×2 block).
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;

    // Translation row: `a`'s offset pushed through `b`, plus `b`'s own offset.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
833
/// Apply an affine transformation matrix to the point `(x, y)`.
///
/// The matrix is read as `[a, b, c, d, e, f]`, producing
/// `(a·x + c·y + e, b·x + d·y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
840
841/// Calculate text width using actual font metrics (including kerning)
842fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
843    // If we have font metrics, use them for accurate width calculation
844    if let Some(font) = font_info {
845        if let Some(ref widths) = font.metrics.widths {
846            let first_char = font.metrics.first_char.unwrap_or(0);
847            let last_char = font.metrics.last_char.unwrap_or(255);
848            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
849
850            let mut total_width = 0.0;
851            let chars: Vec<char> = text.chars().collect();
852
853            for (i, &ch) in chars.iter().enumerate() {
854                let char_code = ch as u32;
855
856                // Get width from Widths array or use missing_width
857                let width = if char_code >= first_char && char_code <= last_char {
858                    let index = (char_code - first_char) as usize;
859                    widths.get(index).copied().unwrap_or(missing_width)
860                } else {
861                    missing_width
862                };
863
864                // Convert from glyph space (1/1000 units) to user space
865                total_width += width / 1000.0 * font_size;
866
867                // Apply kerning if available (for character pairs)
868                if let Some(ref kerning) = font.metrics.kerning {
869                    if i + 1 < chars.len() {
870                        let next_char = chars[i + 1] as u32;
871                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
872                            // Kerning is in FUnits (1/1000), convert to user space
873                            total_width += kern_value / 1000.0 * font_size;
874                        }
875                    }
876                }
877            }
878
879            return total_width;
880        }
881    }
882
883    // Fallback to simplified calculation if no metrics available
884    text.len() as f64 * font_size * 0.5
885}
886
/// Clean up extracted text by neutralising stray control characters.
///
/// Some PDFs (Issue #116) yield NUL bytes (`\0`) and ETX characters (`\u{3}`) in
/// places where a word separator was intended. This function turns that noise back
/// into readable text.
///
/// # Behavior
///
/// - A NUL (`\0`) is treated like a space, so `\0\u{3}` and a lone `\0` both become
///   a single space
/// - ETX (`\u{3}`) and every other ASCII control character are dropped, except:
///   - `\t` (0x09) - Tab
///   - `\n` (0x0A) - Line feed
///   - `\r` (0x0D) - Carriage return
/// - Runs of spaces (including those produced from NULs) collapse to one space
/// - A space directly after a tab is dropped; a space after `\n` or `\r` is kept
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX as word separator
/// let dirty = "a\0\u{3}sergeant\0\u{3}and";
/// assert_eq!(sanitize_extracted_text(dirty), "a sergeant and");
///
/// // Standalone NUL becomes space
/// let with_nul = "word\0another";
/// assert_eq!(sanitize_extracted_text(with_nul), "word another");
///
/// // Clean text passes through unchanged
/// let clean = "Normal text";
/// assert_eq!(sanitize_extracted_text(clean), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // The output never exceeds the input: each character is kept, dropped, or
    // replaced by a one-byte space.
    let mut cleaned = String::with_capacity(text.len());
    // True while the most recently emitted character counts as a blank, so that
    // further spaces are suppressed.
    let mut after_blank = false;

    for ch in text.chars() {
        match ch {
            // NUL stands in for a space. An ETX that follows it is discarded by the
            // control-character arm below, so `\0\u{3}` yields exactly one space.
            '\0' | ' ' => {
                if !after_blank {
                    cleaned.push(' ');
                }
                after_blank = true;
            }

            // Allowed whitespace is kept verbatim. A tab suppresses a following
            // space; a line break does not.
            '\t' | '\n' | '\r' => {
                cleaned.push(ch);
                after_blank = ch == '\t';
            }

            // ETX and all remaining ASCII control characters vanish without
            // affecting space collapsing.
            c if c.is_ascii_control() => {}

            // Everything else is ordinary text.
            _ => {
                cleaned.push(ch);
                after_blank = false;
            }
        }
    }

    cleaned
}
979
980#[cfg(test)]
981mod tests {
982    use super::*;
983
984    #[test]
985    fn test_matrix_multiplication() {
986        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
987        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
988
989        let result = multiply_matrix(&identity, &translation);
990        assert_eq!(result, translation);
991
992        let result2 = multiply_matrix(&translation, &identity);
993        assert_eq!(result2, translation);
994    }
995
996    #[test]
997    fn test_transform_point() {
998        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
999        let (x, y) = transform_point(5.0, 5.0, &translation);
1000        assert_eq!(x, 15.0);
1001        assert_eq!(y, 25.0);
1002    }
1003
1004    #[test]
1005    fn test_extraction_options_default() {
1006        let options = ExtractionOptions::default();
1007        assert!(!options.preserve_layout);
1008        assert_eq!(options.space_threshold, 0.3);
1009        assert_eq!(options.newline_threshold, 10.0);
1010        assert!(options.sort_by_position);
1011        assert!(!options.detect_columns);
1012        assert_eq!(options.column_threshold, 50.0);
1013        assert!(options.merge_hyphenated);
1014    }
1015
1016    #[test]
1017    fn test_extraction_options_custom() {
1018        let options = ExtractionOptions {
1019            preserve_layout: true,
1020            space_threshold: 0.5,
1021            newline_threshold: 15.0,
1022            sort_by_position: false,
1023            detect_columns: true,
1024            column_threshold: 75.0,
1025            merge_hyphenated: false,
1026        };
1027        assert!(options.preserve_layout);
1028        assert_eq!(options.space_threshold, 0.5);
1029        assert_eq!(options.newline_threshold, 15.0);
1030        assert!(!options.sort_by_position);
1031        assert!(options.detect_columns);
1032        assert_eq!(options.column_threshold, 75.0);
1033        assert!(!options.merge_hyphenated);
1034    }
1035
1036    #[test]
1037    fn test_parse_font_style_bold() {
1038        // PostScript style
1039        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1040        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1041
1042        // TrueType style
1043        assert_eq!(parse_font_style("Arial Bold"), (true, false));
1044        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1045
1046        // Short form
1047        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1048    }
1049
1050    #[test]
1051    fn test_parse_font_style_italic() {
1052        // PostScript style
1053        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1054        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1055
1056        // TrueType style
1057        assert_eq!(parse_font_style("Arial Italic"), (false, true));
1058        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1059
1060        // Short form
1061        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1062    }
1063
1064    #[test]
1065    fn test_parse_font_style_bold_italic() {
1066        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1067        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1068        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1069    }
1070
1071    #[test]
1072    fn test_parse_font_style_regular() {
1073        assert_eq!(parse_font_style("Helvetica"), (false, false));
1074        assert_eq!(parse_font_style("Times-Roman"), (false, false));
1075        assert_eq!(parse_font_style("Courier"), (false, false));
1076        assert_eq!(parse_font_style("Arial"), (false, false));
1077    }
1078
1079    #[test]
1080    fn test_parse_font_style_edge_cases() {
1081        // Empty and unusual cases
1082        assert_eq!(parse_font_style(""), (false, false));
1083        assert_eq!(parse_font_style("UnknownFont"), (false, false));
1084
1085        // Case insensitive
1086        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1087        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1088    }
1089
1090    #[test]
1091    fn test_text_fragment() {
1092        let fragment = TextFragment {
1093            text: "Hello".to_string(),
1094            x: 100.0,
1095            y: 200.0,
1096            width: 50.0,
1097            height: 12.0,
1098            font_size: 10.0,
1099            font_name: None,
1100            is_bold: false,
1101            is_italic: false,
1102            color: None,
1103        };
1104        assert_eq!(fragment.text, "Hello");
1105        assert_eq!(fragment.x, 100.0);
1106        assert_eq!(fragment.y, 200.0);
1107        assert_eq!(fragment.width, 50.0);
1108        assert_eq!(fragment.height, 12.0);
1109        assert_eq!(fragment.font_size, 10.0);
1110    }
1111
1112    #[test]
1113    fn test_extracted_text() {
1114        let fragments = vec![
1115            TextFragment {
1116                text: "Hello".to_string(),
1117                x: 100.0,
1118                y: 200.0,
1119                width: 50.0,
1120                height: 12.0,
1121                font_size: 10.0,
1122                font_name: None,
1123                is_bold: false,
1124                is_italic: false,
1125                color: None,
1126            },
1127            TextFragment {
1128                text: "World".to_string(),
1129                x: 160.0,
1130                y: 200.0,
1131                width: 50.0,
1132                height: 12.0,
1133                font_size: 10.0,
1134                font_name: None,
1135                is_bold: false,
1136                is_italic: false,
1137                color: None,
1138            },
1139        ];
1140
1141        let extracted = ExtractedText {
1142            text: "Hello World".to_string(),
1143            fragments: fragments,
1144        };
1145
1146        assert_eq!(extracted.text, "Hello World");
1147        assert_eq!(extracted.fragments.len(), 2);
1148        assert_eq!(extracted.fragments[0].text, "Hello");
1149        assert_eq!(extracted.fragments[1].text, "World");
1150    }
1151
1152    #[test]
1153    fn test_text_state_default() {
1154        let state = TextState::default();
1155        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1156        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1157        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1158        assert_eq!(state.leading, 0.0);
1159        assert_eq!(state.char_space, 0.0);
1160        assert_eq!(state.word_space, 0.0);
1161        assert_eq!(state.horizontal_scale, 100.0);
1162        assert_eq!(state.text_rise, 0.0);
1163        assert_eq!(state.font_size, 0.0);
1164        assert!(state.font_name.is_none());
1165        assert_eq!(state.render_mode, 0);
1166    }
1167
1168    #[test]
1169    fn test_matrix_operations() {
1170        // Test rotation matrix
1171        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
1172        let (x, y) = transform_point(1.0, 0.0, &rotation);
1173        assert_eq!(x, 0.0);
1174        assert_eq!(y, 1.0);
1175
1176        // Test scaling matrix
1177        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1178        let (x, y) = transform_point(5.0, 5.0, &scale);
1179        assert_eq!(x, 10.0);
1180        assert_eq!(y, 15.0);
1181
1182        // Test complex transformation
1183        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1184        let (x, y) = transform_point(1.0, 1.0, &complex);
1185        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
1186        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
1187    }
1188
1189    #[test]
1190    fn test_text_extractor_new() {
1191        let extractor = TextExtractor::new();
1192        let options = extractor.options;
1193        assert!(!options.preserve_layout);
1194        assert_eq!(options.space_threshold, 0.3);
1195        assert_eq!(options.newline_threshold, 10.0);
1196        assert!(options.sort_by_position);
1197        assert!(!options.detect_columns);
1198        assert_eq!(options.column_threshold, 50.0);
1199        assert!(options.merge_hyphenated);
1200    }
1201
1202    #[test]
1203    fn test_text_extractor_with_options() {
1204        let options = ExtractionOptions {
1205            preserve_layout: true,
1206            space_threshold: 0.3,
1207            newline_threshold: 12.0,
1208            sort_by_position: false,
1209            detect_columns: true,
1210            column_threshold: 60.0,
1211            merge_hyphenated: false,
1212        };
1213        let extractor = TextExtractor::with_options(options.clone());
1214        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1215        assert_eq!(extractor.options.space_threshold, options.space_threshold);
1216        assert_eq!(
1217            extractor.options.newline_threshold,
1218            options.newline_threshold
1219        );
1220        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1221        assert_eq!(extractor.options.detect_columns, options.detect_columns);
1222        assert_eq!(extractor.options.column_threshold, options.column_threshold);
1223        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1224    }
1225
1226    // =========================================================================
1227    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1228    // =========================================================================
1229
1230    #[test]
1231    fn test_calculate_text_width_with_no_font_info() {
1232        // Test fallback: should use simplified calculation
1233        let width = calculate_text_width("Hello", 12.0, None);
1234
1235        // Expected: 5 chars * 12.0 * 0.5 = 30.0
1236        assert_eq!(
1237            width, 30.0,
1238            "Without font info, should use simplified calculation: len * font_size * 0.5"
1239        );
1240    }
1241
1242    #[test]
1243    fn test_calculate_text_width_with_empty_metrics() {
1244        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1245
1246        // Font with no widths array
1247        let font_info = FontInfo {
1248            name: "TestFont".to_string(),
1249            font_type: "Type1".to_string(),
1250            encoding: None,
1251            to_unicode: None,
1252            differences: None,
1253            descendant_font: None,
1254            cid_to_gid_map: None,
1255            metrics: FontMetrics {
1256                first_char: None,
1257                last_char: None,
1258                widths: None,
1259                missing_width: Some(500.0),
1260                kerning: None,
1261            },
1262        };
1263
1264        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1265
1266        // Should fall back to simplified calculation
1267        assert_eq!(
1268            width, 30.0,
1269            "Without widths array, should fall back to simplified calculation"
1270        );
1271    }
1272
1273    #[test]
1274    fn test_calculate_text_width_with_complete_metrics() {
1275        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1276
1277        // Font with complete metrics for ASCII range 32-126
1278        // Simulate typical Helvetica widths (in 1/1000 units)
1279        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
1280
1281        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
1282        widths[72 - 32] = 722.0; // 'H' is ASCII 72
1283        widths[101 - 32] = 556.0; // 'e' is ASCII 101
1284        widths[108 - 32] = 278.0; // 'l' is ASCII 108
1285        widths[111 - 32] = 611.0; // 'o' is ASCII 111
1286
1287        let font_info = FontInfo {
1288            name: "Helvetica".to_string(),
1289            font_type: "Type1".to_string(),
1290            encoding: None,
1291            to_unicode: None,
1292            differences: None,
1293            descendant_font: None,
1294            cid_to_gid_map: None,
1295            metrics: FontMetrics {
1296                first_char: Some(32),
1297                last_char: Some(126),
1298                widths: Some(widths),
1299                missing_width: Some(500.0),
1300                kerning: None,
1301            },
1302        };
1303
1304        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1305
1306        // Expected calculation (widths in glyph space / 1000 * font_size):
1307        // H: 722/1000 * 12 = 8.664
1308        // e: 556/1000 * 12 = 6.672
1309        // l: 278/1000 * 12 = 3.336
1310        // l: 278/1000 * 12 = 3.336
1311        // o: 611/1000 * 12 = 7.332
1312        // Total: 29.34
1313        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1314        let tolerance = 0.0001; // Floating point tolerance
1315        assert!(
1316            (width - expected).abs() < tolerance,
1317            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1318            expected,
1319            width,
1320            (width - expected).abs()
1321        );
1322
1323        // Verify it's different from simplified calculation
1324        let simplified = 5.0 * 12.0 * 0.5; // 30.0
1325        assert_ne!(
1326            width, simplified,
1327            "Metrics-based calculation should differ from simplified (30.0)"
1328        );
1329    }
1330
1331    #[test]
1332    fn test_calculate_text_width_character_outside_range() {
1333        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1334
1335        // Font with narrow range (only covers 'A'-'Z')
1336        let widths = vec![722.0; 26]; // All uppercase letters same width
1337
1338        let font_info = FontInfo {
1339            name: "TestFont".to_string(),
1340            font_type: "Type1".to_string(),
1341            encoding: None,
1342            to_unicode: None,
1343            differences: None,
1344            descendant_font: None,
1345            cid_to_gid_map: None,
1346            metrics: FontMetrics {
1347                first_char: Some(65), // 'A'
1348                last_char: Some(90),  // 'Z'
1349                widths: Some(widths),
1350                missing_width: Some(500.0),
1351                kerning: None,
1352            },
1353        };
1354
1355        // Test with character outside range
1356        let width = calculate_text_width("A1", 10.0, Some(&font_info));
1357
1358        // Expected:
1359        // 'A' (65) is in range: 722/1000 * 10 = 7.22
1360        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
1361        // Total: 12.22
1362        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1363        assert_eq!(
1364            width, expected,
1365            "Should use missing_width for characters outside range"
1366        );
1367    }
1368
1369    #[test]
1370    fn test_calculate_text_width_missing_width_in_array() {
1371        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1372
1373        // Font with incomplete widths array (some characters have 0.0)
1374        let mut widths = vec![500.0; 95]; // Default width
1375        widths[10] = 0.0; // Character at index 10 has no width defined
1376
1377        let font_info = FontInfo {
1378            name: "TestFont".to_string(),
1379            font_type: "Type1".to_string(),
1380            encoding: None,
1381            to_unicode: None,
1382            differences: None,
1383            descendant_font: None,
1384            cid_to_gid_map: None,
1385            metrics: FontMetrics {
1386                first_char: Some(32),
1387                last_char: Some(126),
1388                widths: Some(widths),
1389                missing_width: Some(600.0),
1390                kerning: None,
1391            },
1392        };
1393
1394        // Character 42 (index 10 from first_char 32)
1395        let char_code = 42u8 as char; // '*'
1396        let text = char_code.to_string();
1397        let width = calculate_text_width(&text, 10.0, Some(&font_info));
1398
1399        // Character is in range but width is 0.0, should NOT fall back to missing_width
1400        // (0.0 is a valid width for zero-width characters)
1401        assert_eq!(
1402            width, 0.0,
1403            "Should use 0.0 width from array, not missing_width"
1404        );
1405    }
1406
1407    #[test]
1408    fn test_calculate_text_width_empty_string() {
1409        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1410
1411        let font_info = FontInfo {
1412            name: "TestFont".to_string(),
1413            font_type: "Type1".to_string(),
1414            encoding: None,
1415            to_unicode: None,
1416            differences: None,
1417            descendant_font: None,
1418            cid_to_gid_map: None,
1419            metrics: FontMetrics {
1420                first_char: Some(32),
1421                last_char: Some(126),
1422                widths: Some(vec![500.0; 95]),
1423                missing_width: Some(500.0),
1424                kerning: None,
1425            },
1426        };
1427
1428        let width = calculate_text_width("", 12.0, Some(&font_info));
1429        assert_eq!(width, 0.0, "Empty string should have zero width");
1430
1431        // Also test without font info
1432        let width_no_font = calculate_text_width("", 12.0, None);
1433        assert_eq!(
1434            width_no_font, 0.0,
1435            "Empty string should have zero width (no font)"
1436        );
1437    }
1438
1439    #[test]
1440    fn test_calculate_text_width_unicode_characters() {
1441        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1442
1443        // Font with limited ASCII range
1444        let font_info = FontInfo {
1445            name: "TestFont".to_string(),
1446            font_type: "Type1".to_string(),
1447            encoding: None,
1448            to_unicode: None,
1449            differences: None,
1450            descendant_font: None,
1451            cid_to_gid_map: None,
1452            metrics: FontMetrics {
1453                first_char: Some(32),
1454                last_char: Some(126),
1455                widths: Some(vec![500.0; 95]),
1456                missing_width: Some(600.0),
1457                kerning: None,
1458            },
1459        };
1460
1461        // Test with Unicode characters outside ASCII range
1462        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1463
1464        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
1465        // Expected: 600/1000 * 10 = 6.0
1466        assert_eq!(
1467            width, 6.0,
1468            "Unicode character outside range should use missing_width"
1469        );
1470    }
1471
1472    #[test]
1473    fn test_calculate_text_width_different_font_sizes() {
1474        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1475
1476        let font_info = FontInfo {
1477            name: "TestFont".to_string(),
1478            font_type: "Type1".to_string(),
1479            encoding: None,
1480            to_unicode: None,
1481            differences: None,
1482            descendant_font: None,
1483            cid_to_gid_map: None,
1484            metrics: FontMetrics {
1485                first_char: Some(65), // 'A'
1486                last_char: Some(65),  // 'A'
1487                widths: Some(vec![722.0]),
1488                missing_width: Some(500.0),
1489                kerning: None,
1490            },
1491        };
1492
1493        // Test same character with different font sizes
1494        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1495        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1496
1497        // Widths should scale linearly with font size
1498        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1499        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1500        assert_eq!(
1501            width_20,
1502            width_10 * 2.0,
1503            "Width should scale linearly with font size"
1504        );
1505    }
1506
1507    #[test]
1508    fn test_calculate_text_width_proportional_vs_monospace() {
1509        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1510
1511        // Simulate proportional font (different widths)
1512        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1513        let proportional_font = FontInfo {
1514            name: "Helvetica".to_string(),
1515            font_type: "Type1".to_string(),
1516            encoding: None,
1517            to_unicode: None,
1518            differences: None,
1519            descendant_font: None,
1520            cid_to_gid_map: None,
1521            metrics: FontMetrics {
1522                first_char: Some(105), // 'i'
1523                last_char: Some(107),  // covers i, j, k
1524                widths: Some(proportional_widths),
1525                missing_width: Some(500.0),
1526                kerning: None,
1527            },
1528        };
1529
1530        // Simulate monospace font (same width)
1531        let monospace_widths = vec![600.0, 600.0, 600.0];
1532        let monospace_font = FontInfo {
1533            name: "Courier".to_string(),
1534            font_type: "Type1".to_string(),
1535            encoding: None,
1536            to_unicode: None,
1537            differences: None,
1538            descendant_font: None,
1539            cid_to_gid_map: None,
1540            metrics: FontMetrics {
1541                first_char: Some(105),
1542                last_char: Some(107),
1543                widths: Some(monospace_widths),
1544                missing_width: Some(600.0),
1545                kerning: None,
1546            },
1547        };
1548
1549        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1550        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1551
1552        // Proportional 'i' should be narrower than monospace 'i'
1553        assert!(
1554            prop_width < mono_width,
1555            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1556            prop_width,
1557            mono_width
1558        );
1559    }
1560
1561    // =========================================================================
1562    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1563    // =========================================================================
1564
1565    #[test]
1566    fn test_calculate_text_width_with_kerning() {
1567        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1568        use std::collections::HashMap;
1569
1570        // Create a font with kerning pairs
1571        let mut widths = vec![500.0; 95]; // ASCII 32-126
1572        widths[65 - 32] = 722.0; // 'A'
1573        widths[86 - 32] = 722.0; // 'V'
1574        widths[87 - 32] = 944.0; // 'W'
1575
1576        let mut kerning = HashMap::new();
1577        // Typical kerning pairs (in FUnits, 1/1000)
1578        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1579        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1580
1581        let font_info = FontInfo {
1582            name: "Helvetica".to_string(),
1583            font_type: "Type1".to_string(),
1584            encoding: None,
1585            to_unicode: None,
1586            differences: None,
1587            descendant_font: None,
1588            cid_to_gid_map: None,
1589            metrics: FontMetrics {
1590                first_char: Some(32),
1591                last_char: Some(126),
1592                widths: Some(widths),
1593                missing_width: Some(500.0),
1594                kerning: Some(kerning),
1595            },
1596        };
1597
1598        // Test "AV" with kerning
1599        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1600        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1601        //         = 17.328 - 0.6 = 16.728
1602        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1603        let tolerance = 0.0001;
1604        assert!(
1605            (width_av - expected_av).abs() < tolerance,
1606            "AV with kerning: expected {}, got {}, diff {}",
1607            expected_av,
1608            width_av,
1609            (width_av - expected_av).abs()
1610        );
1611
1612        // Test "AW" with different kerning value
1613        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1614        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1615        //         = 19.992 - 0.48 = 19.512
1616        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1617        assert!(
1618            (width_aw - expected_aw).abs() < tolerance,
1619            "AW with kerning: expected {}, got {}, diff {}",
1620            expected_aw,
1621            width_aw,
1622            (width_aw - expected_aw).abs()
1623        );
1624
1625        // Test "VA" with NO kerning (pair not in HashMap)
1626        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1627        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1628        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1629        assert!(
1630            (width_va - expected_va).abs() < tolerance,
1631            "VA without kerning: expected {}, got {}, diff {}",
1632            expected_va,
1633            width_va,
1634            (width_va - expected_va).abs()
1635        );
1636
1637        // Verify kerning makes a measurable difference
1638        assert!(
1639            width_av < width_va,
1640            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1641            width_av,
1642            width_va
1643        );
1644    }
1645
1646    #[test]
1647    fn test_parse_truetype_kern_table_minimal() {
1648        use crate::text::extraction_cmap::parse_truetype_kern_table;
1649
1650        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1651        // Structure:
1652        // 1. Offset table (12 bytes)
1653        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1654        // 3. 'head' table data (54 bytes)
1655        // 4. 'kern' table data (30 bytes)
1656        // Total: 128 bytes
1657        let mut ttf_data = vec![
1658            // Offset table
1659            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1660            0x00, 0x02, // numTables: 2
1661            0x00, 0x20, // searchRange: 32
1662            0x00, 0x01, // entrySelector: 1
1663            0x00, 0x00, // rangeShift: 0
1664        ];
1665
1666        // Table directory entry 1: 'head' table
1667        ttf_data.extend_from_slice(b"head"); // tag
1668        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1669        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1670        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1671
1672        // Table directory entry 2: 'kern' table
1673        ttf_data.extend_from_slice(b"kern"); // tag
1674        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1675        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1676        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1677
1678        // 'head' table data (54 bytes of zeros - minimal valid head table)
1679        ttf_data.extend_from_slice(&[0u8; 54]);
1680
1681        // 'kern' table data (34 bytes)
1682        ttf_data.extend_from_slice(&[
1683            // Kern table header
1684            0x00, 0x00, // version: 0
1685            0x00, 0x01, // nTables: 1
1686            // Subtable header
1687            0x00, 0x00, // version: 0
1688            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1689            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1690            0x00, 0x02, // nPairs: 2
1691            0x00, 0x08, // searchRange: 8
1692            0x00, 0x00, // entrySelector: 0
1693            0x00, 0x04, // rangeShift: 4
1694            // Kerning pair 1: A + V → -50
1695            0x00, 0x41, // left glyph: 65 ('A')
1696            0x00, 0x56, // right glyph: 86 ('V')
1697            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1698            // Kerning pair 2: A + W → -40
1699            0x00, 0x41, // left glyph: 65 ('A')
1700            0x00, 0x57, // right glyph: 87 ('W')
1701            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1702        ]);
1703
1704        let result = parse_truetype_kern_table(&ttf_data);
1705        assert!(
1706            result.is_ok(),
1707            "Should parse minimal kern table successfully: {:?}",
1708            result.err()
1709        );
1710
1711        let kerning_map = result.unwrap();
1712        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1713
1714        // Verify pair 1: A + V → -50
1715        assert_eq!(
1716            kerning_map.get(&(65, 86)),
1717            Some(&-50.0),
1718            "Should have A+V kerning pair with value -50"
1719        );
1720
1721        // Verify pair 2: A + W → -40
1722        assert_eq!(
1723            kerning_map.get(&(65, 87)),
1724            Some(&-40.0),
1725            "Should have A+W kerning pair with value -40"
1726        );
1727    }
1728
1729    #[test]
1730    fn test_parse_kern_table_no_kern_table() {
1731        use crate::text::extraction_cmap::extract_truetype_kerning;
1732
1733        // TrueType font data WITHOUT a 'kern' table
1734        // Structure:
1735        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1736        // - Table directory: 1 entry for 'head' table (not 'kern')
1737        let ttf_data = vec![
1738            // Offset table
1739            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1740            0x00, 0x01, // numTables: 1
1741            0x00, 0x10, // searchRange: 16
1742            0x00, 0x00, // entrySelector: 0
1743            0x00, 0x00, // rangeShift: 0
1744            // Table directory entry: 'head' table (not 'kern')
1745            b'h', b'e', b'a', b'd', // tag: 'head'
1746            0x00, 0x00, 0x00, 0x00, // checksum
1747            0x00, 0x00, 0x00, 0x1C, // offset: 28
1748            0x00, 0x00, 0x00, 0x36, // length: 54
1749            // Mock 'head' table data (54 bytes of zeros)
1750            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1751            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1752            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1753            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1754        ];
1755
1756        let result = extract_truetype_kerning(&ttf_data);
1757        assert!(
1758            result.is_ok(),
1759            "Should gracefully handle missing kern table"
1760        );
1761
1762        let kerning_map = result.unwrap();
1763        assert!(
1764            kerning_map.is_empty(),
1765            "Should return empty HashMap when no kern table exists"
1766        );
1767    }
1768}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs