oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable knobs for [`TextExtractor`].
///
/// The defaults favour plain-text output: fragments are not collected,
/// position sorting is enabled, and hyphenated line breaks are merged.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Collect positioned [`TextFragment`]s and rebuild the page text from them
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the font size, above which a space is inserted
    pub space_threshold: f64,
    /// Vertical distance between two pieces of text above which a newline is inserted
    pub newline_threshold: f64,
    /// Order collected fragments top-to-bottom, then left-to-right
    pub sort_by_position: bool,
    /// Additionally group sorted fragments into columns
    pub detect_columns: bool,
    /// Horizontal gap between neighbouring fragments that is treated as a column gutter
    pub column_threshold: f64,
    /// Drop a trailing `-` and join the two lines when a line break follows it
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Build the stock configuration used by `TextExtractor::new`.
    fn default() -> Self {
        ExtractionOptions {
            // Layout handling
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            // Distance thresholds
            space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
48
49/// Extracted text with position information
50#[derive(Debug, Clone)]
51pub struct ExtractedText {
52    /// The extracted text content
53    pub text: String,
54    /// Text fragments with position information (if preserve_layout is true)
55    pub fragments: Vec<TextFragment>,
56}
57
58/// A fragment of text with position information
59#[derive(Debug, Clone)]
60pub struct TextFragment {
61    /// Text content
62    pub text: String,
63    /// X position in page coordinates
64    pub x: f64,
65    /// Y position in page coordinates
66    pub y: f64,
67    /// Width of the text
68    pub width: f64,
69    /// Height of the text
70    pub height: f64,
71    /// Font size
72    pub font_size: f64,
73    /// Font name (if known) - used for kerning-aware text spacing
74    pub font_name: Option<String>,
75    /// Whether the font is bold (detected from font name)
76    pub is_bold: bool,
77    /// Whether the font is italic (detected from font name)
78    pub is_italic: bool,
79    /// Fill color of the text (from graphics state)
80    pub color: Option<Color>,
81}
82
83/// Text extraction state
84struct TextState {
85    /// Current text matrix
86    text_matrix: [f64; 6],
87    /// Current text line matrix
88    text_line_matrix: [f64; 6],
89    /// Current transformation matrix (CTM)
90    ctm: [f64; 6],
91    /// Text leading (line spacing)
92    leading: f64,
93    /// Character spacing
94    char_space: f64,
95    /// Word spacing
96    word_space: f64,
97    /// Horizontal scaling
98    horizontal_scale: f64,
99    /// Text rise
100    text_rise: f64,
101    /// Current font size
102    font_size: f64,
103    /// Current font name
104    font_name: Option<String>,
105    /// Render mode (0 = fill, 1 = stroke, etc.)
106    render_mode: u8,
107    /// Fill color (for text rendering)
108    fill_color: Option<Color>,
109}
110
111impl Default for TextState {
112    fn default() -> Self {
113        Self {
114            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117            leading: 0.0,
118            char_space: 0.0,
119            word_space: 0.0,
120            horizontal_scale: 100.0,
121            text_rise: 0.0,
122            font_size: 0.0,
123            font_name: None,
124            render_mode: 0,
125            fill_color: None,
126        }
127    }
128}
129
/// Guess bold/italic styling from a font name.
///
/// The name is lower-cased and scanned for style markers, so both PostScript
/// names ("Helvetica-Bold", "Times-BoldItalic") and spaced names
/// ("Arial Bold", "Courier Oblique") are recognised.
///
/// The short markers are plain substring checks: any `-b` counts as bold and
/// any `-i` counts as italic, so a name such as "Helvetica-Book" is reported
/// as bold.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Markers that may appear anywhere in the lower-cased name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    // A style matches when any marker occurs, or when the name ends with the
    // stand-alone single-letter suffix.
    let has_style = |markers: &[&str], suffix: &str| {
        markers.iter().any(|marker| lowered.contains(*marker)) || lowered.ends_with(suffix)
    };

    (
        has_style(&BOLD_MARKERS, " b"),
        has_style(&ITALIC_MARKERS, " i"),
    )
}
168
169/// Text extractor for PDF pages with CMap support
170pub struct TextExtractor {
171    options: ExtractionOptions,
172    /// Font cache for the current extraction
173    font_cache: HashMap<String, FontInfo>,
174}
175
176impl TextExtractor {
177    /// Create a new text extractor with default options
178    pub fn new() -> Self {
179        Self {
180            options: ExtractionOptions::default(),
181            font_cache: HashMap::new(),
182        }
183    }
184
185    /// Create a text extractor with custom options
186    pub fn with_options(options: ExtractionOptions) -> Self {
187        Self {
188            options,
189            font_cache: HashMap::new(),
190        }
191    }
192
193    /// Extract text from a PDF document
194    pub fn extract_from_document<R: Read + Seek>(
195        &mut self,
196        document: &PdfDocument<R>,
197    ) -> ParseResult<Vec<ExtractedText>> {
198        let page_count = document.page_count()?;
199        let mut results = Vec::new();
200
201        for i in 0..page_count {
202            let text = self.extract_from_page(document, i)?;
203            results.push(text);
204        }
205
206        Ok(results)
207    }
208
209    /// Extract text from a specific page
210    pub fn extract_from_page<R: Read + Seek>(
211        &mut self,
212        document: &PdfDocument<R>,
213        page_index: u32,
214    ) -> ParseResult<ExtractedText> {
215        // Get the page
216        let page = document.get_page(page_index)?;
217
218        // Extract font resources first
219        self.extract_font_resources(&page, document)?;
220
221        // Get content streams
222        let streams = page.content_streams_with_document(document)?;
223
224        let mut extracted_text = String::new();
225        let mut fragments = Vec::new();
226        let mut state = TextState::default();
227        let mut in_text_object = false;
228        let mut last_x = 0.0;
229        let mut last_y = 0.0;
230
231        // Process each content stream
232        for (stream_idx, stream_data) in streams.iter().enumerate() {
233            let operations = match ContentParser::parse_content(stream_data) {
234                Ok(ops) => ops,
235                Err(e) => {
236                    // Enhanced diagnostic logging for content stream parsing failures
237                    eprintln!(
238                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
239                        page_index + 1,
240                        stream_idx + 1,
241                        streams.len()
242                    );
243                    eprintln!("         Error: {}", e);
244                    eprintln!("         Stream size: {} bytes", stream_data.len());
245
246                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
247                    let preview_len = stream_data.len().min(100);
248                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249                    eprintln!(
250                        "         Stream preview (first {} bytes): {:?}",
251                        preview_len,
252                        preview.chars().take(80).collect::<String>()
253                    );
254
255                    // Continue processing other streams
256                    continue;
257                }
258            };
259
260            for op in operations {
261                match op {
262                    ContentOperation::BeginText => {
263                        in_text_object = true;
264                        // Reset text matrix to identity
265                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267                    }
268
269                    ContentOperation::EndText => {
270                        in_text_object = false;
271                    }
272
273                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274                        state.text_matrix =
275                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276                        state.text_line_matrix =
277                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278                    }
279
280                    ContentOperation::MoveText(tx, ty) => {
281                        // Update text matrix by translation
282                        let new_matrix = multiply_matrix(
283                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284                            &state.text_line_matrix,
285                        );
286                        state.text_matrix = new_matrix;
287                        state.text_line_matrix = new_matrix;
288                    }
289
290                    ContentOperation::NextLine => {
291                        // Move to next line using current leading
292                        let new_matrix = multiply_matrix(
293                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294                            &state.text_line_matrix,
295                        );
296                        state.text_matrix = new_matrix;
297                        state.text_line_matrix = new_matrix;
298                    }
299
300                    ContentOperation::ShowText(text) => {
301                        if in_text_object {
302                            let text_bytes = &text;
303                            let decoded = self.decode_text(text_bytes, &state)?;
304
305                            // Calculate position: Apply text_matrix, then CTM
306                            // Concatenate: final = CTM × text_matrix
307                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310                            // Add spacing based on position change
311                            if !extracted_text.is_empty() {
312                                let dx = x - last_x;
313                                let dy = (y - last_y).abs();
314
315                                if dy > self.options.newline_threshold {
316                                    extracted_text.push('\n');
317                                } else if dx > self.options.space_threshold * state.font_size {
318                                    extracted_text.push(' ');
319                                }
320                            }
321
322                            extracted_text.push_str(&decoded);
323
324                            // Get font info for accurate width calculation
325                            let font_info = state
326                                .font_name
327                                .as_ref()
328                                .and_then(|name| self.font_cache.get(name));
329
330                            if self.options.preserve_layout {
331                                // Detect bold/italic from font name
332                                let (is_bold, is_italic) = state
333                                    .font_name
334                                    .as_ref()
335                                    .map(|name| parse_font_style(name))
336                                    .unwrap_or((false, false));
337
338                                fragments.push(TextFragment {
339                                    text: decoded.clone(),
340                                    x,
341                                    y,
342                                    width: calculate_text_width(
343                                        &decoded,
344                                        state.font_size,
345                                        font_info,
346                                    ),
347                                    height: state.font_size,
348                                    font_size: state.font_size,
349                                    font_name: state.font_name.clone(),
350                                    is_bold,
351                                    is_italic,
352                                    color: state.fill_color,
353                                });
354                            }
355
356                            // Update position for next text
357                            last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358                            last_y = y;
359
360                            // Update text matrix for next show operation
361                            let text_width =
362                                calculate_text_width(&decoded, state.font_size, font_info);
363                            let tx = text_width * state.horizontal_scale / 100.0;
364                            state.text_matrix =
365                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366                        }
367                    }
368
369                    ContentOperation::ShowTextArray(array) => {
370                        if in_text_object {
371                            // Get font info for accurate width calculation
372                            let font_info = state
373                                .font_name
374                                .as_ref()
375                                .and_then(|name| self.font_cache.get(name));
376
377                            for item in array {
378                                match item {
379                                    TextElement::Text(text_bytes) => {
380                                        let decoded = self.decode_text(&text_bytes, &state)?;
381                                        extracted_text.push_str(&decoded);
382
383                                        // Update text matrix
384                                        let text_width = calculate_text_width(
385                                            &decoded,
386                                            state.font_size,
387                                            font_info,
388                                        );
389                                        let tx = text_width * state.horizontal_scale / 100.0;
390                                        state.text_matrix = multiply_matrix(
391                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392                                            &state.text_matrix,
393                                        );
394                                    }
395                                    TextElement::Spacing(adjustment) => {
396                                        // Text position adjustment (negative = move left)
397                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398                                        state.text_matrix = multiply_matrix(
399                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400                                            &state.text_matrix,
401                                        );
402                                    }
403                                }
404                            }
405                        }
406                    }
407
408                    ContentOperation::SetFont(name, size) => {
409                        state.font_name = Some(name);
410                        state.font_size = size as f64;
411                    }
412
413                    ContentOperation::SetLeading(leading) => {
414                        state.leading = leading as f64;
415                    }
416
417                    ContentOperation::SetCharSpacing(spacing) => {
418                        state.char_space = spacing as f64;
419                    }
420
421                    ContentOperation::SetWordSpacing(spacing) => {
422                        state.word_space = spacing as f64;
423                    }
424
425                    ContentOperation::SetHorizontalScaling(scale) => {
426                        state.horizontal_scale = scale as f64;
427                    }
428
429                    ContentOperation::SetTextRise(rise) => {
430                        state.text_rise = rise as f64;
431                    }
432
433                    ContentOperation::SetTextRenderMode(mode) => {
434                        state.render_mode = mode as u8;
435                    }
436
437                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438                        // Update CTM: new_ctm = concat_matrix * current_ctm
439                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
440                        let a = a as f64;
441                        let b = b as f64;
442                        let c = c as f64;
443                        let d = d as f64;
444                        let e = e as f64;
445                        let f = f as f64;
446                        state.ctm = [
447                            a * a0 + b * c0,
448                            a * b0 + b * d0,
449                            c * a0 + d * c0,
450                            c * b0 + d * d0,
451                            e * a0 + f * c0 + e0,
452                            e * b0 + f * d0 + f0,
453                        ];
454                    }
455
456                    // Color operations (Phase 4: Color extraction)
457                    ContentOperation::SetNonStrokingGray(gray) => {
458                        state.fill_color = Some(Color::gray(gray as f64));
459                    }
460
461                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
462                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463                    }
464
465                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466                        state.fill_color =
467                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468                    }
469
470                    _ => {
471                        // Other operations don't affect text extraction
472                    }
473                }
474            }
475        }
476
477        // Sort and process fragments if requested
478        if self.options.sort_by_position && !fragments.is_empty() {
479            self.sort_and_merge_fragments(&mut fragments);
480        }
481
482        // Merge close fragments to eliminate spacing artifacts
483        // This is crucial for table detection and structured data extraction
484        if self.options.preserve_layout && !fragments.is_empty() {
485            fragments = self.merge_close_fragments(&fragments);
486        }
487
488        // Reconstruct text from sorted fragments if layout is preserved
489        if self.options.preserve_layout && !fragments.is_empty() {
490            extracted_text = self.reconstruct_text_from_fragments(&fragments);
491        }
492
493        Ok(ExtractedText {
494            text: extracted_text,
495            fragments,
496        })
497    }
498
499    /// Sort text fragments by position and merge them appropriately
500    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501        // Sort fragments by Y position (top to bottom) then X position (left to right)
502        fragments.sort_by(|a, b| {
503            // First compare Y position (with threshold for same line)
504            let y_diff = (b.y - a.y).abs();
505            if y_diff < self.options.newline_threshold {
506                // Same line, sort by X position
507                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
508            } else {
509                // Different lines, sort by Y (inverted because PDF Y increases upward)
510                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
511            }
512        });
513
514        // Detect columns if requested
515        if self.options.detect_columns {
516            self.detect_and_sort_columns(fragments);
517        }
518    }
519
520    /// Detect columns and re-sort fragments accordingly
521    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
522        // Group fragments by approximate Y position
523        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
524        let mut current_line: Vec<&mut TextFragment> = Vec::new();
525        let mut last_y = f64::INFINITY;
526
527        for fragment in fragments.iter_mut() {
528            let fragment_y = fragment.y;
529            if (last_y - fragment_y).abs() > self.options.newline_threshold
530                && !current_line.is_empty()
531            {
532                lines.push(current_line);
533                current_line = Vec::new();
534            }
535            current_line.push(fragment);
536            last_y = fragment_y;
537        }
538        if !current_line.is_empty() {
539            lines.push(current_line);
540        }
541
542        // Detect column boundaries
543        let mut column_boundaries = vec![0.0];
544        for line in &lines {
545            if line.len() > 1 {
546                for i in 0..line.len() - 1 {
547                    let gap = line[i + 1].x - (line[i].x + line[i].width);
548                    if gap > self.options.column_threshold {
549                        let boundary = line[i].x + line[i].width + gap / 2.0;
550                        if !column_boundaries
551                            .iter()
552                            .any(|&b| (b - boundary).abs() < 10.0)
553                        {
554                            column_boundaries.push(boundary);
555                        }
556                    }
557                }
558            }
559        }
560        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
561
562        // Re-sort fragments by column then Y position
563        if column_boundaries.len() > 1 {
564            fragments.sort_by(|a, b| {
565                // Determine column for each fragment
566                let col_a = column_boundaries
567                    .iter()
568                    .position(|&boundary| a.x < boundary)
569                    .unwrap_or(column_boundaries.len())
570                    - 1;
571                let col_b = column_boundaries
572                    .iter()
573                    .position(|&boundary| b.x < boundary)
574                    .unwrap_or(column_boundaries.len())
575                    - 1;
576
577                if col_a != col_b {
578                    col_a.cmp(&col_b)
579                } else {
580                    // Same column, sort by Y position
581                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
582                }
583            });
584        }
585    }
586
587    /// Reconstruct text from sorted fragments
588    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
589        // First, merge consecutive fragments that are very close together
590        let merged_fragments = self.merge_close_fragments(fragments);
591
592        let mut result = String::new();
593        let mut last_y = f64::INFINITY;
594        let mut last_x = 0.0;
595        let mut last_line_ended_with_hyphen = false;
596
597        for fragment in &merged_fragments {
598            // Check if we need a newline
599            let y_diff = (last_y - fragment.y).abs();
600            if !result.is_empty() && y_diff > self.options.newline_threshold {
601                // Handle hyphenation
602                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
603                    // Remove the hyphen and don't add newline
604                    if result.ends_with('-') {
605                        result.pop();
606                    }
607                } else {
608                    result.push('\n');
609                }
610            } else if !result.is_empty() {
611                // Check if we need a space
612                let x_gap = fragment.x - last_x;
613                if x_gap > self.options.space_threshold * fragment.font_size {
614                    result.push(' ');
615                }
616            }
617
618            result.push_str(&fragment.text);
619            last_line_ended_with_hyphen = fragment.text.ends_with('-');
620            last_y = fragment.y;
621            last_x = fragment.x + fragment.width;
622        }
623
624        result
625    }
626
627    /// Merge fragments that are very close together on the same line
628    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
629    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
630        if fragments.is_empty() {
631            return Vec::new();
632        }
633
634        let mut merged = Vec::new();
635        let mut current = fragments[0].clone();
636
637        for fragment in &fragments[1..] {
638            // Check if this fragment is on the same line and very close
639            let y_diff = (current.y - fragment.y).abs();
640            let x_gap = fragment.x - (current.x + current.width);
641
642            // Merge if on same line and gap is less than a character width
643            // Use 0.5 * font_size as threshold - this catches most artificial spacing
644            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
645                && x_gap >= 0.0  // Fragment is to the right
646                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
647
648            if should_merge {
649                // Merge this fragment into current
650                current.text.push_str(&fragment.text);
651                current.width = (fragment.x + fragment.width) - current.x;
652            } else {
653                // Start a new fragment
654                merged.push(current);
655                current = fragment.clone();
656            }
657        }
658
659        merged.push(current);
660        merged
661    }
662
663    /// Extract font resources from page
664    fn extract_font_resources<R: Read + Seek>(
665        &mut self,
666        page: &ParsedPage,
667        document: &PdfDocument<R>,
668    ) -> ParseResult<()> {
669        // Clear previous font cache
670        self.font_cache.clear();
671
672        // Try to get resources manually from page dictionary first
673        // This is necessary because ParsedPage.get_resources() may not always work
674        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
675            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
676            {
677                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
678                    // Extract each font
679                    for (font_name, font_obj) in font_dict.0.iter() {
680                        if let Some(font_ref) = font_obj.as_reference() {
681                            if let Ok(PdfObject::Dictionary(font_dict)) =
682                                document.get_object(font_ref.0, font_ref.1)
683                            {
684                                // Create a CMap extractor to use its font extraction logic
685                                let mut cmap_extractor: CMapTextExtractor<R> =
686                                    CMapTextExtractor::new();
687
688                                if let Ok(font_info) =
689                                    cmap_extractor.extract_font_info(&font_dict, document)
690                                {
691                                    let has_to_unicode = font_info.to_unicode.is_some();
692                                    self.font_cache.insert(font_name.0.clone(), font_info);
693                                    tracing::debug!(
694                                        "Cached font: {} (ToUnicode: {})",
695                                        font_name.0,
696                                        has_to_unicode
697                                    );
698                                }
699                            }
700                        }
701                    }
702                }
703            }
704        } else if let Some(resources) = page.get_resources() {
705            // Fallback to get_resources() if Resources is not a reference
706            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
707                for (font_name, font_obj) in font_dict.0.iter() {
708                    if let Some(font_ref) = font_obj.as_reference() {
709                        if let Ok(PdfObject::Dictionary(font_dict)) =
710                            document.get_object(font_ref.0, font_ref.1)
711                        {
712                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
713
714                            if let Ok(font_info) =
715                                cmap_extractor.extract_font_info(&font_dict, document)
716                            {
717                                let has_to_unicode = font_info.to_unicode.is_some();
718                                self.font_cache.insert(font_name.0.clone(), font_info);
719                                tracing::debug!(
720                                    "Cached font: {} (ToUnicode: {})",
721                                    font_name.0,
722                                    has_to_unicode
723                                );
724                            }
725                        }
726                    }
727                }
728            }
729        }
730
731        Ok(())
732    }
733
734    /// Decode text using the current font encoding and ToUnicode mapping
735    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
736        use crate::text::encoding::TextEncoding;
737
738        // First, try to use cached font information with ToUnicode CMap
739        if let Some(ref font_name) = state.font_name {
740            if let Some(font_info) = self.font_cache.get(font_name) {
741                // Create a temporary CMapTextExtractor to use its decoding logic
742                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
743
744                // Try CMap-based decoding first
745                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
746                    // Only accept if we got meaningful text (not all null bytes or garbage)
747                    if !decoded.trim().is_empty()
748                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
749                    {
750                        tracing::debug!(
751                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
752                            font_name,
753                            text,
754                            decoded
755                        );
756                        return Ok(decoded);
757                    }
758                }
759
760                tracing::debug!(
761                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
762                    font_name
763                );
764            }
765        }
766
767        // Fall back to encoding-based decoding
768        let encoding = if let Some(ref font_name) = state.font_name {
769            match font_name.to_lowercase().as_str() {
770                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
771                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
772                name if name.contains("standard") => TextEncoding::StandardEncoding,
773                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
774                _ => {
775                    // Default based on common patterns
776                    if font_name.starts_with("Times")
777                        || font_name.starts_with("Helvetica")
778                        || font_name.starts_with("Courier")
779                    {
780                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
781                    } else {
782                        TextEncoding::PdfDocEncoding // Safe default
783                    }
784                }
785            }
786        } else {
787            TextEncoding::WinAnsiEncoding // Default for most PDFs
788        };
789
790        let fallback_result = encoding.decode(text);
791        tracing::debug!(
792            "Fallback encoding decoding: {:?} -> \"{}\"",
793            text,
794            fallback_result
795        );
796        Ok(fallback_result)
797    }
798}
799
800impl Default for TextExtractor {
801    fn default() -> Self {
802        Self::new()
803    }
804}
805
/// Compute the product `a × b` of two 2-D affine matrices.
///
/// Each matrix is stored as six numbers `[a, b, c, d, e, f]`, standing for the
/// 3×3 matrix whose rows are `(a, b, 0)`, `(c, d, 0)` and `(e, f, 1)`.
/// Under the row-vector convention used by `transform_point`, the result
/// applies `a` first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear 2×2 part.
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;

    // Translation row: `a`'s translation pushed through `b`, plus `b`'s own.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
817
/// Map the point `(x, y)` through the affine matrix `[a, b, c, d, e, f]`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
824
825/// Calculate text width using actual font metrics (including kerning)
826fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
827    // If we have font metrics, use them for accurate width calculation
828    if let Some(font) = font_info {
829        if let Some(ref widths) = font.metrics.widths {
830            let first_char = font.metrics.first_char.unwrap_or(0);
831            let last_char = font.metrics.last_char.unwrap_or(255);
832            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
833
834            let mut total_width = 0.0;
835            let chars: Vec<char> = text.chars().collect();
836
837            for (i, &ch) in chars.iter().enumerate() {
838                let char_code = ch as u32;
839
840                // Get width from Widths array or use missing_width
841                let width = if char_code >= first_char && char_code <= last_char {
842                    let index = (char_code - first_char) as usize;
843                    widths.get(index).copied().unwrap_or(missing_width)
844                } else {
845                    missing_width
846                };
847
848                // Convert from glyph space (1/1000 units) to user space
849                total_width += width / 1000.0 * font_size;
850
851                // Apply kerning if available (for character pairs)
852                if let Some(ref kerning) = font.metrics.kerning {
853                    if i + 1 < chars.len() {
854                        let next_char = chars[i + 1] as u32;
855                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
856                            // Kerning is in FUnits (1/1000), convert to user space
857                            total_width += kern_value / 1000.0 * font_size;
858                        }
859                    }
860                }
861            }
862
863            return total_width;
864        }
865    }
866
867    // Fallback to simplified calculation if no metrics available
868    text.len() as f64 * font_size * 0.5
869}
870
871#[cfg(test)]
872mod tests {
873    use super::*;
874
875    #[test]
876    fn test_matrix_multiplication() {
877        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
878        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
879
880        let result = multiply_matrix(&identity, &translation);
881        assert_eq!(result, translation);
882
883        let result2 = multiply_matrix(&translation, &identity);
884        assert_eq!(result2, translation);
885    }
886
887    #[test]
888    fn test_transform_point() {
889        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
890        let (x, y) = transform_point(5.0, 5.0, &translation);
891        assert_eq!(x, 15.0);
892        assert_eq!(y, 25.0);
893    }
894
895    #[test]
896    fn test_extraction_options_default() {
897        let options = ExtractionOptions::default();
898        assert!(!options.preserve_layout);
899        assert_eq!(options.space_threshold, 0.2);
900        assert_eq!(options.newline_threshold, 10.0);
901        assert!(options.sort_by_position);
902        assert!(!options.detect_columns);
903        assert_eq!(options.column_threshold, 50.0);
904        assert!(options.merge_hyphenated);
905    }
906
907    #[test]
908    fn test_extraction_options_custom() {
909        let options = ExtractionOptions {
910            preserve_layout: true,
911            space_threshold: 0.5,
912            newline_threshold: 15.0,
913            sort_by_position: false,
914            detect_columns: true,
915            column_threshold: 75.0,
916            merge_hyphenated: false,
917        };
918        assert!(options.preserve_layout);
919        assert_eq!(options.space_threshold, 0.5);
920        assert_eq!(options.newline_threshold, 15.0);
921        assert!(!options.sort_by_position);
922        assert!(options.detect_columns);
923        assert_eq!(options.column_threshold, 75.0);
924        assert!(!options.merge_hyphenated);
925    }
926
927    #[test]
928    fn test_parse_font_style_bold() {
929        // PostScript style
930        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
931        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
932
933        // TrueType style
934        assert_eq!(parse_font_style("Arial Bold"), (true, false));
935        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
936
937        // Short form
938        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
939    }
940
941    #[test]
942    fn test_parse_font_style_italic() {
943        // PostScript style
944        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
945        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
946
947        // TrueType style
948        assert_eq!(parse_font_style("Arial Italic"), (false, true));
949        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
950
951        // Short form
952        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
953    }
954
955    #[test]
956    fn test_parse_font_style_bold_italic() {
957        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
958        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
959        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
960    }
961
962    #[test]
963    fn test_parse_font_style_regular() {
964        assert_eq!(parse_font_style("Helvetica"), (false, false));
965        assert_eq!(parse_font_style("Times-Roman"), (false, false));
966        assert_eq!(parse_font_style("Courier"), (false, false));
967        assert_eq!(parse_font_style("Arial"), (false, false));
968    }
969
970    #[test]
971    fn test_parse_font_style_edge_cases() {
972        // Empty and unusual cases
973        assert_eq!(parse_font_style(""), (false, false));
974        assert_eq!(parse_font_style("UnknownFont"), (false, false));
975
976        // Case insensitive
977        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
978        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
979    }
980
981    #[test]
982    fn test_text_fragment() {
983        let fragment = TextFragment {
984            text: "Hello".to_string(),
985            x: 100.0,
986            y: 200.0,
987            width: 50.0,
988            height: 12.0,
989            font_size: 10.0,
990            font_name: None,
991            is_bold: false,
992            is_italic: false,
993            color: None,
994        };
995        assert_eq!(fragment.text, "Hello");
996        assert_eq!(fragment.x, 100.0);
997        assert_eq!(fragment.y, 200.0);
998        assert_eq!(fragment.width, 50.0);
999        assert_eq!(fragment.height, 12.0);
1000        assert_eq!(fragment.font_size, 10.0);
1001    }
1002
1003    #[test]
1004    fn test_extracted_text() {
1005        let fragments = vec![
1006            TextFragment {
1007                text: "Hello".to_string(),
1008                x: 100.0,
1009                y: 200.0,
1010                width: 50.0,
1011                height: 12.0,
1012                font_size: 10.0,
1013                font_name: None,
1014                is_bold: false,
1015                is_italic: false,
1016                color: None,
1017            },
1018            TextFragment {
1019                text: "World".to_string(),
1020                x: 160.0,
1021                y: 200.0,
1022                width: 50.0,
1023                height: 12.0,
1024                font_size: 10.0,
1025                font_name: None,
1026                is_bold: false,
1027                is_italic: false,
1028                color: None,
1029            },
1030        ];
1031
1032        let extracted = ExtractedText {
1033            text: "Hello World".to_string(),
1034            fragments: fragments.clone(),
1035        };
1036
1037        assert_eq!(extracted.text, "Hello World");
1038        assert_eq!(extracted.fragments.len(), 2);
1039        assert_eq!(extracted.fragments[0].text, "Hello");
1040        assert_eq!(extracted.fragments[1].text, "World");
1041    }
1042
1043    #[test]
1044    fn test_text_state_default() {
1045        let state = TextState::default();
1046        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1047        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1048        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1049        assert_eq!(state.leading, 0.0);
1050        assert_eq!(state.char_space, 0.0);
1051        assert_eq!(state.word_space, 0.0);
1052        assert_eq!(state.horizontal_scale, 100.0);
1053        assert_eq!(state.text_rise, 0.0);
1054        assert_eq!(state.font_size, 0.0);
1055        assert!(state.font_name.is_none());
1056        assert_eq!(state.render_mode, 0);
1057    }
1058
1059    #[test]
1060    fn test_matrix_operations() {
1061        // Test rotation matrix
1062        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
1063        let (x, y) = transform_point(1.0, 0.0, &rotation);
1064        assert_eq!(x, 0.0);
1065        assert_eq!(y, 1.0);
1066
1067        // Test scaling matrix
1068        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1069        let (x, y) = transform_point(5.0, 5.0, &scale);
1070        assert_eq!(x, 10.0);
1071        assert_eq!(y, 15.0);
1072
1073        // Test complex transformation
1074        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1075        let (x, y) = transform_point(1.0, 1.0, &complex);
1076        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
1077        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
1078    }
1079
1080    #[test]
1081    fn test_text_extractor_new() {
1082        let extractor = TextExtractor::new();
1083        let options = extractor.options;
1084        assert!(!options.preserve_layout);
1085        assert_eq!(options.space_threshold, 0.2);
1086        assert_eq!(options.newline_threshold, 10.0);
1087        assert!(options.sort_by_position);
1088        assert!(!options.detect_columns);
1089        assert_eq!(options.column_threshold, 50.0);
1090        assert!(options.merge_hyphenated);
1091    }
1092
1093    #[test]
1094    fn test_text_extractor_with_options() {
1095        let options = ExtractionOptions {
1096            preserve_layout: true,
1097            space_threshold: 0.3,
1098            newline_threshold: 12.0,
1099            sort_by_position: false,
1100            detect_columns: true,
1101            column_threshold: 60.0,
1102            merge_hyphenated: false,
1103        };
1104        let extractor = TextExtractor::with_options(options.clone());
1105        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1106        assert_eq!(extractor.options.space_threshold, options.space_threshold);
1107        assert_eq!(
1108            extractor.options.newline_threshold,
1109            options.newline_threshold
1110        );
1111        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1112        assert_eq!(extractor.options.detect_columns, options.detect_columns);
1113        assert_eq!(extractor.options.column_threshold, options.column_threshold);
1114        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1115    }
1116
1117    // =========================================================================
1118    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1119    // =========================================================================
1120
1121    #[test]
1122    fn test_calculate_text_width_with_no_font_info() {
1123        // Test fallback: should use simplified calculation
1124        let width = calculate_text_width("Hello", 12.0, None);
1125
1126        // Expected: 5 chars * 12.0 * 0.5 = 30.0
1127        assert_eq!(
1128            width, 30.0,
1129            "Without font info, should use simplified calculation: len * font_size * 0.5"
1130        );
1131    }
1132
1133    #[test]
1134    fn test_calculate_text_width_with_empty_metrics() {
1135        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1136
1137        // Font with no widths array
1138        let font_info = FontInfo {
1139            name: "TestFont".to_string(),
1140            font_type: "Type1".to_string(),
1141            encoding: None,
1142            to_unicode: None,
1143            differences: None,
1144            descendant_font: None,
1145            cid_to_gid_map: None,
1146            metrics: FontMetrics {
1147                first_char: None,
1148                last_char: None,
1149                widths: None,
1150                missing_width: Some(500.0),
1151                kerning: None,
1152            },
1153        };
1154
1155        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1156
1157        // Should fall back to simplified calculation
1158        assert_eq!(
1159            width, 30.0,
1160            "Without widths array, should fall back to simplified calculation"
1161        );
1162    }
1163
1164    #[test]
1165    fn test_calculate_text_width_with_complete_metrics() {
1166        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1167
1168        // Font with complete metrics for ASCII range 32-126
1169        // Simulate typical Helvetica widths (in 1/1000 units)
1170        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
1171
1172        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
1173        widths[72 - 32] = 722.0; // 'H' is ASCII 72
1174        widths[101 - 32] = 556.0; // 'e' is ASCII 101
1175        widths[108 - 32] = 278.0; // 'l' is ASCII 108
1176        widths[111 - 32] = 611.0; // 'o' is ASCII 111
1177
1178        let font_info = FontInfo {
1179            name: "Helvetica".to_string(),
1180            font_type: "Type1".to_string(),
1181            encoding: None,
1182            to_unicode: None,
1183            differences: None,
1184            descendant_font: None,
1185            cid_to_gid_map: None,
1186            metrics: FontMetrics {
1187                first_char: Some(32),
1188                last_char: Some(126),
1189                widths: Some(widths),
1190                missing_width: Some(500.0),
1191                kerning: None,
1192            },
1193        };
1194
1195        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1196
1197        // Expected calculation (widths in glyph space / 1000 * font_size):
1198        // H: 722/1000 * 12 = 8.664
1199        // e: 556/1000 * 12 = 6.672
1200        // l: 278/1000 * 12 = 3.336
1201        // l: 278/1000 * 12 = 3.336
1202        // o: 611/1000 * 12 = 7.332
1203        // Total: 29.34
1204        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1205        let tolerance = 0.0001; // Floating point tolerance
1206        assert!(
1207            (width - expected).abs() < tolerance,
1208            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1209            expected,
1210            width,
1211            (width - expected).abs()
1212        );
1213
1214        // Verify it's different from simplified calculation
1215        let simplified = 5.0 * 12.0 * 0.5; // 30.0
1216        assert_ne!(
1217            width, simplified,
1218            "Metrics-based calculation should differ from simplified (30.0)"
1219        );
1220    }
1221
1222    #[test]
1223    fn test_calculate_text_width_character_outside_range() {
1224        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1225
1226        // Font with narrow range (only covers 'A'-'Z')
1227        let widths = vec![722.0; 26]; // All uppercase letters same width
1228
1229        let font_info = FontInfo {
1230            name: "TestFont".to_string(),
1231            font_type: "Type1".to_string(),
1232            encoding: None,
1233            to_unicode: None,
1234            differences: None,
1235            descendant_font: None,
1236            cid_to_gid_map: None,
1237            metrics: FontMetrics {
1238                first_char: Some(65), // 'A'
1239                last_char: Some(90),  // 'Z'
1240                widths: Some(widths),
1241                missing_width: Some(500.0),
1242                kerning: None,
1243            },
1244        };
1245
1246        // Test with character outside range
1247        let width = calculate_text_width("A1", 10.0, Some(&font_info));
1248
1249        // Expected:
1250        // 'A' (65) is in range: 722/1000 * 10 = 7.22
1251        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
1252        // Total: 12.22
1253        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1254        assert_eq!(
1255            width, expected,
1256            "Should use missing_width for characters outside range"
1257        );
1258    }
1259
1260    #[test]
1261    fn test_calculate_text_width_missing_width_in_array() {
1262        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1263
1264        // Font with incomplete widths array (some characters have 0.0)
1265        let mut widths = vec![500.0; 95]; // Default width
1266        widths[10] = 0.0; // Character at index 10 has no width defined
1267
1268        let font_info = FontInfo {
1269            name: "TestFont".to_string(),
1270            font_type: "Type1".to_string(),
1271            encoding: None,
1272            to_unicode: None,
1273            differences: None,
1274            descendant_font: None,
1275            cid_to_gid_map: None,
1276            metrics: FontMetrics {
1277                first_char: Some(32),
1278                last_char: Some(126),
1279                widths: Some(widths),
1280                missing_width: Some(600.0),
1281                kerning: None,
1282            },
1283        };
1284
1285        // Character 42 (index 10 from first_char 32)
1286        let char_code = 42u8 as char; // '*'
1287        let text = char_code.to_string();
1288        let width = calculate_text_width(&text, 10.0, Some(&font_info));
1289
1290        // Character is in range but width is 0.0, should NOT fall back to missing_width
1291        // (0.0 is a valid width for zero-width characters)
1292        assert_eq!(
1293            width, 0.0,
1294            "Should use 0.0 width from array, not missing_width"
1295        );
1296    }
1297
1298    #[test]
1299    fn test_calculate_text_width_empty_string() {
1300        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1301
1302        let font_info = FontInfo {
1303            name: "TestFont".to_string(),
1304            font_type: "Type1".to_string(),
1305            encoding: None,
1306            to_unicode: None,
1307            differences: None,
1308            descendant_font: None,
1309            cid_to_gid_map: None,
1310            metrics: FontMetrics {
1311                first_char: Some(32),
1312                last_char: Some(126),
1313                widths: Some(vec![500.0; 95]),
1314                missing_width: Some(500.0),
1315                kerning: None,
1316            },
1317        };
1318
1319        let width = calculate_text_width("", 12.0, Some(&font_info));
1320        assert_eq!(width, 0.0, "Empty string should have zero width");
1321
1322        // Also test without font info
1323        let width_no_font = calculate_text_width("", 12.0, None);
1324        assert_eq!(
1325            width_no_font, 0.0,
1326            "Empty string should have zero width (no font)"
1327        );
1328    }
1329
1330    #[test]
1331    fn test_calculate_text_width_unicode_characters() {
1332        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1333
1334        // Font with limited ASCII range
1335        let font_info = FontInfo {
1336            name: "TestFont".to_string(),
1337            font_type: "Type1".to_string(),
1338            encoding: None,
1339            to_unicode: None,
1340            differences: None,
1341            descendant_font: None,
1342            cid_to_gid_map: None,
1343            metrics: FontMetrics {
1344                first_char: Some(32),
1345                last_char: Some(126),
1346                widths: Some(vec![500.0; 95]),
1347                missing_width: Some(600.0),
1348                kerning: None,
1349            },
1350        };
1351
1352        // Test with Unicode characters outside ASCII range
1353        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1354
1355        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
1356        // Expected: 600/1000 * 10 = 6.0
1357        assert_eq!(
1358            width, 6.0,
1359            "Unicode character outside range should use missing_width"
1360        );
1361    }
1362
1363    #[test]
1364    fn test_calculate_text_width_different_font_sizes() {
1365        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1366
1367        let font_info = FontInfo {
1368            name: "TestFont".to_string(),
1369            font_type: "Type1".to_string(),
1370            encoding: None,
1371            to_unicode: None,
1372            differences: None,
1373            descendant_font: None,
1374            cid_to_gid_map: None,
1375            metrics: FontMetrics {
1376                first_char: Some(65), // 'A'
1377                last_char: Some(65),  // 'A'
1378                widths: Some(vec![722.0]),
1379                missing_width: Some(500.0),
1380                kerning: None,
1381            },
1382        };
1383
1384        // Test same character with different font sizes
1385        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1386        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1387
1388        // Widths should scale linearly with font size
1389        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1390        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1391        assert_eq!(
1392            width_20,
1393            width_10 * 2.0,
1394            "Width should scale linearly with font size"
1395        );
1396    }
1397
1398    #[test]
1399    fn test_calculate_text_width_proportional_vs_monospace() {
1400        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1401
1402        // Simulate proportional font (different widths)
1403        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1404        let proportional_font = FontInfo {
1405            name: "Helvetica".to_string(),
1406            font_type: "Type1".to_string(),
1407            encoding: None,
1408            to_unicode: None,
1409            differences: None,
1410            descendant_font: None,
1411            cid_to_gid_map: None,
1412            metrics: FontMetrics {
1413                first_char: Some(105), // 'i'
1414                last_char: Some(107),  // covers i, j, k
1415                widths: Some(proportional_widths),
1416                missing_width: Some(500.0),
1417                kerning: None,
1418            },
1419        };
1420
1421        // Simulate monospace font (same width)
1422        let monospace_widths = vec![600.0, 600.0, 600.0];
1423        let monospace_font = FontInfo {
1424            name: "Courier".to_string(),
1425            font_type: "Type1".to_string(),
1426            encoding: None,
1427            to_unicode: None,
1428            differences: None,
1429            descendant_font: None,
1430            cid_to_gid_map: None,
1431            metrics: FontMetrics {
1432                first_char: Some(105),
1433                last_char: Some(107),
1434                widths: Some(monospace_widths),
1435                missing_width: Some(600.0),
1436                kerning: None,
1437            },
1438        };
1439
1440        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1441        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1442
1443        // Proportional 'i' should be narrower than monospace 'i'
1444        assert!(
1445            prop_width < mono_width,
1446            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1447            prop_width,
1448            mono_width
1449        );
1450    }
1451
1452    // =========================================================================
1453    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1454    // =========================================================================
1455
1456    #[test]
1457    fn test_calculate_text_width_with_kerning() {
1458        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1459        use std::collections::HashMap;
1460
1461        // Create a font with kerning pairs
1462        let mut widths = vec![500.0; 95]; // ASCII 32-126
1463        widths[65 - 32] = 722.0; // 'A'
1464        widths[86 - 32] = 722.0; // 'V'
1465        widths[87 - 32] = 944.0; // 'W'
1466
1467        let mut kerning = HashMap::new();
1468        // Typical kerning pairs (in FUnits, 1/1000)
1469        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1470        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1471
1472        let font_info = FontInfo {
1473            name: "Helvetica".to_string(),
1474            font_type: "Type1".to_string(),
1475            encoding: None,
1476            to_unicode: None,
1477            differences: None,
1478            descendant_font: None,
1479            cid_to_gid_map: None,
1480            metrics: FontMetrics {
1481                first_char: Some(32),
1482                last_char: Some(126),
1483                widths: Some(widths),
1484                missing_width: Some(500.0),
1485                kerning: Some(kerning),
1486            },
1487        };
1488
1489        // Test "AV" with kerning
1490        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1491        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1492        //         = 17.328 - 0.6 = 16.728
1493        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1494        let tolerance = 0.0001;
1495        assert!(
1496            (width_av - expected_av).abs() < tolerance,
1497            "AV with kerning: expected {}, got {}, diff {}",
1498            expected_av,
1499            width_av,
1500            (width_av - expected_av).abs()
1501        );
1502
1503        // Test "AW" with different kerning value
1504        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1505        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1506        //         = 19.992 - 0.48 = 19.512
1507        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1508        assert!(
1509            (width_aw - expected_aw).abs() < tolerance,
1510            "AW with kerning: expected {}, got {}, diff {}",
1511            expected_aw,
1512            width_aw,
1513            (width_aw - expected_aw).abs()
1514        );
1515
1516        // Test "VA" with NO kerning (pair not in HashMap)
1517        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1518        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1519        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1520        assert!(
1521            (width_va - expected_va).abs() < tolerance,
1522            "VA without kerning: expected {}, got {}, diff {}",
1523            expected_va,
1524            width_va,
1525            (width_va - expected_va).abs()
1526        );
1527
1528        // Verify kerning makes a measurable difference
1529        assert!(
1530            width_av < width_va,
1531            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1532            width_av,
1533            width_va
1534        );
1535    }
1536
1537    #[test]
1538    fn test_parse_truetype_kern_table_minimal() {
1539        use crate::text::extraction_cmap::parse_truetype_kern_table;
1540
1541        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1542        // Structure:
1543        // 1. Offset table (12 bytes)
1544        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1545        // 3. 'head' table data (54 bytes)
1546        // 4. 'kern' table data (30 bytes)
1547        // Total: 128 bytes
1548        let mut ttf_data = vec![
1549            // Offset table
1550            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1551            0x00, 0x02, // numTables: 2
1552            0x00, 0x20, // searchRange: 32
1553            0x00, 0x01, // entrySelector: 1
1554            0x00, 0x00, // rangeShift: 0
1555        ];
1556
1557        // Table directory entry 1: 'head' table
1558        ttf_data.extend_from_slice(b"head"); // tag
1559        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1560        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1561        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1562
1563        // Table directory entry 2: 'kern' table
1564        ttf_data.extend_from_slice(b"kern"); // tag
1565        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1566        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1567        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1568
1569        // 'head' table data (54 bytes of zeros - minimal valid head table)
1570        ttf_data.extend_from_slice(&[0u8; 54]);
1571
1572        // 'kern' table data (34 bytes)
1573        ttf_data.extend_from_slice(&[
1574            // Kern table header
1575            0x00, 0x00, // version: 0
1576            0x00, 0x01, // nTables: 1
1577            // Subtable header
1578            0x00, 0x00, // version: 0
1579            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1580            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1581            0x00, 0x02, // nPairs: 2
1582            0x00, 0x08, // searchRange: 8
1583            0x00, 0x00, // entrySelector: 0
1584            0x00, 0x04, // rangeShift: 4
1585            // Kerning pair 1: A + V → -50
1586            0x00, 0x41, // left glyph: 65 ('A')
1587            0x00, 0x56, // right glyph: 86 ('V')
1588            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1589            // Kerning pair 2: A + W → -40
1590            0x00, 0x41, // left glyph: 65 ('A')
1591            0x00, 0x57, // right glyph: 87 ('W')
1592            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1593        ]);
1594
1595        let result = parse_truetype_kern_table(&ttf_data);
1596        assert!(
1597            result.is_ok(),
1598            "Should parse minimal kern table successfully: {:?}",
1599            result.err()
1600        );
1601
1602        let kerning_map = result.unwrap();
1603        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1604
1605        // Verify pair 1: A + V → -50
1606        assert_eq!(
1607            kerning_map.get(&(65, 86)),
1608            Some(&-50.0),
1609            "Should have A+V kerning pair with value -50"
1610        );
1611
1612        // Verify pair 2: A + W → -40
1613        assert_eq!(
1614            kerning_map.get(&(65, 87)),
1615            Some(&-40.0),
1616            "Should have A+W kerning pair with value -40"
1617        );
1618    }
1619
1620    #[test]
1621    fn test_parse_kern_table_no_kern_table() {
1622        use crate::text::extraction_cmap::extract_truetype_kerning;
1623
1624        // TrueType font data WITHOUT a 'kern' table
1625        // Structure:
1626        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1627        // - Table directory: 1 entry for 'head' table (not 'kern')
1628        let ttf_data = vec![
1629            // Offset table
1630            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1631            0x00, 0x01, // numTables: 1
1632            0x00, 0x10, // searchRange: 16
1633            0x00, 0x00, // entrySelector: 0
1634            0x00, 0x00, // rangeShift: 0
1635            // Table directory entry: 'head' table (not 'kern')
1636            b'h', b'e', b'a', b'd', // tag: 'head'
1637            0x00, 0x00, 0x00, 0x00, // checksum
1638            0x00, 0x00, 0x00, 0x1C, // offset: 28
1639            0x00, 0x00, 0x00, 0x36, // length: 54
1640            // Mock 'head' table data (54 bytes of zeros)
1641            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1642            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1643            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1644            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1645        ];
1646
1647        let result = extract_truetype_kerning(&ttf_data);
1648        assert!(
1649            result.is_ok(),
1650            "Should gracefully handle missing kern table"
1651        );
1652
1653        let kerning_map = result.unwrap();
1654        assert!(
1655            kerning_map.is_empty(),
1656            "Should return empty HashMap when no kern table exists"
1657        );
1658    }
1659}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs