oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Tunable settings for page text extraction.
///
/// Use [`Default::default`] for the stock configuration and override individual
/// fields with struct-update syntax.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Record positioned fragments while extracting and rebuild the final text
    /// from them instead of using the running text buffer.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the current font size, above which a
    /// space character is inserted between two text runs.
    pub space_threshold: f64,
    /// Vertical distance between two text runs above which a newline is inserted.
    pub newline_threshold: f64,
    /// Sort recorded fragments by position before the text is rebuilt
    /// (useful for multi-column layouts).
    pub sort_by_position: bool,
    /// After sorting, additionally look for column gaps and order fragments
    /// column by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on one line that is
    /// treated as a column separation (in page units).
    pub column_threshold: f64,
    /// Drop a trailing hyphen and the line break when a line ends with `-`.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Stock configuration: plain text output, position sorting enabled,
    /// column detection off, hyphenated line ends merged.
    fn default() -> Self {
        ExtractionOptions {
            // Output shape
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            // Distance thresholds
            space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
47
48/// Extracted text with position information
49#[derive(Debug, Clone)]
50pub struct ExtractedText {
51    /// The extracted text content
52    pub text: String,
53    /// Text fragments with position information (if preserve_layout is true)
54    pub fragments: Vec<TextFragment>,
55}
56
/// One run of text together with where it was placed on the page.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of this run
    pub text: String,
    /// X coordinate of the run's origin in page coordinates
    pub x: f64,
    /// Y coordinate of the run's origin in page coordinates
    pub y: f64,
    /// Estimated advance width of the run
    pub width: f64,
    /// Height of the run (the extractor stores the font size here)
    pub height: f64,
    /// Font size that was active when the run was shown
    pub font_size: f64,
    /// Name of the font that was active when the run was shown, if any
    pub font_name: Option<String>,
    /// `true` when the font name looks like a bold face
    pub is_bold: bool,
    /// `true` when the font name looks like an italic/oblique face
    pub is_italic: bool,
}
79
/// Mutable text/graphics state tracked while walking a page's content operations.
struct TextState {
    /// Text matrix: position of the next glyph to be shown
    text_matrix: [f64; 6],
    /// Text line matrix: start of the current line, base for line moves
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM)
    ctm: [f64; 6],
    /// Leading (line spacing) used by the next-line operation
    leading: f64,
    /// Character spacing
    char_space: f64,
    /// Word spacing
    word_space: f64,
    /// Horizontal scaling, in percent (100.0 = unscaled)
    horizontal_scale: f64,
    /// Text rise
    text_rise: f64,
    /// Size of the currently selected font
    font_size: f64,
    /// Name of the currently selected font, if one has been set
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    render_mode: u8,
}

impl Default for TextState {
    /// Initial state: identity matrices, no font selected, 100% horizontal scale.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            ctm: IDENTITY,
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            horizontal_scale: 100.0,
            text_rise: 0.0,
            font_size: 0.0,
            font_name: None,
            render_mode: 0,
        }
    }
}
123
/// Guess bold/italic style from a font name.
///
/// The name is lower-cased and searched for the usual style markers: the words
/// `bold`, `italic` and `oblique`, plus the abbreviations `b` / `i` when they
/// follow a hyphen, stand between spaces, or end the name after a space.
/// This covers PostScript-style names (e.g., "Helvetica-Bold", "Times-BoldItalic")
/// as well as spaced names (e.g., "Arial Bold", "Courier Oblique").
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Markers that may appear anywhere in the lower-cased name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();
    let has_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    // The single-letter abbreviation also counts when it is the last word.
    let bold = has_any(&BOLD_MARKERS) || lowered.ends_with(" b");
    let italic = has_any(&ITALIC_MARKERS) || lowered.ends_with(" i");

    (bold, italic)
}
162
163/// Text extractor for PDF pages with CMap support
164pub struct TextExtractor {
165    options: ExtractionOptions,
166    /// Font cache for the current extraction
167    font_cache: HashMap<String, FontInfo>,
168}
169
170impl TextExtractor {
171    /// Create a new text extractor with default options
172    pub fn new() -> Self {
173        Self {
174            options: ExtractionOptions::default(),
175            font_cache: HashMap::new(),
176        }
177    }
178
179    /// Create a text extractor with custom options
180    pub fn with_options(options: ExtractionOptions) -> Self {
181        Self {
182            options,
183            font_cache: HashMap::new(),
184        }
185    }
186
187    /// Extract text from a PDF document
188    pub fn extract_from_document<R: Read + Seek>(
189        &mut self,
190        document: &PdfDocument<R>,
191    ) -> ParseResult<Vec<ExtractedText>> {
192        let page_count = document.page_count()?;
193        let mut results = Vec::new();
194
195        for i in 0..page_count {
196            let text = self.extract_from_page(document, i)?;
197            results.push(text);
198        }
199
200        Ok(results)
201    }
202
203    /// Extract text from a specific page
204    pub fn extract_from_page<R: Read + Seek>(
205        &mut self,
206        document: &PdfDocument<R>,
207        page_index: u32,
208    ) -> ParseResult<ExtractedText> {
209        // Get the page
210        let page = document.get_page(page_index)?;
211
212        // Extract font resources first
213        self.extract_font_resources(&page, document)?;
214
215        // Get content streams
216        let streams = page.content_streams_with_document(document)?;
217
218        let mut extracted_text = String::new();
219        let mut fragments = Vec::new();
220        let mut state = TextState::default();
221        let mut in_text_object = false;
222        let mut last_x = 0.0;
223        let mut last_y = 0.0;
224
225        // Process each content stream
226        for (stream_idx, stream_data) in streams.iter().enumerate() {
227            let operations = match ContentParser::parse_content(stream_data) {
228                Ok(ops) => ops,
229                Err(e) => {
230                    // Enhanced diagnostic logging for content stream parsing failures
231                    eprintln!(
232                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
233                        page_index + 1,
234                        stream_idx + 1,
235                        streams.len()
236                    );
237                    eprintln!("         Error: {}", e);
238                    eprintln!("         Stream size: {} bytes", stream_data.len());
239
240                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
241                    let preview_len = stream_data.len().min(100);
242                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
243                    eprintln!(
244                        "         Stream preview (first {} bytes): {:?}",
245                        preview_len,
246                        preview.chars().take(80).collect::<String>()
247                    );
248
249                    // Continue processing other streams
250                    continue;
251                }
252            };
253
254            for op in operations {
255                match op {
256                    ContentOperation::BeginText => {
257                        in_text_object = true;
258                        // Reset text matrix to identity
259                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
260                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
261                    }
262
263                    ContentOperation::EndText => {
264                        in_text_object = false;
265                    }
266
267                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
268                        state.text_matrix =
269                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
270                        state.text_line_matrix =
271                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
272                    }
273
274                    ContentOperation::MoveText(tx, ty) => {
275                        // Update text matrix by translation
276                        let new_matrix = multiply_matrix(
277                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
278                            &state.text_line_matrix,
279                        );
280                        state.text_matrix = new_matrix;
281                        state.text_line_matrix = new_matrix;
282                    }
283
284                    ContentOperation::NextLine => {
285                        // Move to next line using current leading
286                        let new_matrix = multiply_matrix(
287                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
288                            &state.text_line_matrix,
289                        );
290                        state.text_matrix = new_matrix;
291                        state.text_line_matrix = new_matrix;
292                    }
293
294                    ContentOperation::ShowText(text) => {
295                        if in_text_object {
296                            let text_bytes = &text;
297                            let decoded = self.decode_text(text_bytes, &state)?;
298
299                            // Calculate position: Apply text_matrix, then CTM
300                            // Concatenate: final = CTM × text_matrix
301                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
302                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
303
304                            // Add spacing based on position change
305                            if !extracted_text.is_empty() {
306                                let dx = x - last_x;
307                                let dy = (y - last_y).abs();
308
309                                if dy > self.options.newline_threshold {
310                                    extracted_text.push('\n');
311                                } else if dx > self.options.space_threshold * state.font_size {
312                                    extracted_text.push(' ');
313                                }
314                            }
315
316                            extracted_text.push_str(&decoded);
317
318                            // Get font info for accurate width calculation
319                            let font_info = state
320                                .font_name
321                                .as_ref()
322                                .and_then(|name| self.font_cache.get(name));
323
324                            if self.options.preserve_layout {
325                                // Detect bold/italic from font name
326                                let (is_bold, is_italic) = state
327                                    .font_name
328                                    .as_ref()
329                                    .map(|name| parse_font_style(name))
330                                    .unwrap_or((false, false));
331
332                                fragments.push(TextFragment {
333                                    text: decoded.clone(),
334                                    x,
335                                    y,
336                                    width: calculate_text_width(
337                                        &decoded,
338                                        state.font_size,
339                                        font_info,
340                                    ),
341                                    height: state.font_size,
342                                    font_size: state.font_size,
343                                    font_name: state.font_name.clone(),
344                                    is_bold,
345                                    is_italic,
346                                });
347                            }
348
349                            // Update position for next text
350                            last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
351                            last_y = y;
352
353                            // Update text matrix for next show operation
354                            let text_width =
355                                calculate_text_width(&decoded, state.font_size, font_info);
356                            let tx = text_width * state.horizontal_scale / 100.0;
357                            state.text_matrix =
358                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
359                        }
360                    }
361
362                    ContentOperation::ShowTextArray(array) => {
363                        if in_text_object {
364                            // Get font info for accurate width calculation
365                            let font_info = state
366                                .font_name
367                                .as_ref()
368                                .and_then(|name| self.font_cache.get(name));
369
370                            for item in array {
371                                match item {
372                                    TextElement::Text(text_bytes) => {
373                                        let decoded = self.decode_text(&text_bytes, &state)?;
374                                        extracted_text.push_str(&decoded);
375
376                                        // Update text matrix
377                                        let text_width = calculate_text_width(
378                                            &decoded,
379                                            state.font_size,
380                                            font_info,
381                                        );
382                                        let tx = text_width * state.horizontal_scale / 100.0;
383                                        state.text_matrix = multiply_matrix(
384                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
385                                            &state.text_matrix,
386                                        );
387                                    }
388                                    TextElement::Spacing(adjustment) => {
389                                        // Text position adjustment (negative = move left)
390                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
391                                        state.text_matrix = multiply_matrix(
392                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
393                                            &state.text_matrix,
394                                        );
395                                    }
396                                }
397                            }
398                        }
399                    }
400
401                    ContentOperation::SetFont(name, size) => {
402                        state.font_name = Some(name);
403                        state.font_size = size as f64;
404                    }
405
406                    ContentOperation::SetLeading(leading) => {
407                        state.leading = leading as f64;
408                    }
409
410                    ContentOperation::SetCharSpacing(spacing) => {
411                        state.char_space = spacing as f64;
412                    }
413
414                    ContentOperation::SetWordSpacing(spacing) => {
415                        state.word_space = spacing as f64;
416                    }
417
418                    ContentOperation::SetHorizontalScaling(scale) => {
419                        state.horizontal_scale = scale as f64;
420                    }
421
422                    ContentOperation::SetTextRise(rise) => {
423                        state.text_rise = rise as f64;
424                    }
425
426                    ContentOperation::SetTextRenderMode(mode) => {
427                        state.render_mode = mode as u8;
428                    }
429
430                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
431                        // Update CTM: new_ctm = concat_matrix * current_ctm
432                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
433                        let a = a as f64;
434                        let b = b as f64;
435                        let c = c as f64;
436                        let d = d as f64;
437                        let e = e as f64;
438                        let f = f as f64;
439                        state.ctm = [
440                            a * a0 + b * c0,
441                            a * b0 + b * d0,
442                            c * a0 + d * c0,
443                            c * b0 + d * d0,
444                            e * a0 + f * c0 + e0,
445                            e * b0 + f * d0 + f0,
446                        ];
447                    }
448
449                    _ => {
450                        // Other operations don't affect text extraction
451                    }
452                }
453            }
454        }
455
456        // Sort and process fragments if requested
457        if self.options.sort_by_position && !fragments.is_empty() {
458            self.sort_and_merge_fragments(&mut fragments);
459        }
460
461        // Reconstruct text from sorted fragments if layout is preserved
462        if self.options.preserve_layout && !fragments.is_empty() {
463            extracted_text = self.reconstruct_text_from_fragments(&fragments);
464        }
465
466        Ok(ExtractedText {
467            text: extracted_text,
468            fragments,
469        })
470    }
471
472    /// Sort text fragments by position and merge them appropriately
473    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
474        // Sort fragments by Y position (top to bottom) then X position (left to right)
475        fragments.sort_by(|a, b| {
476            // First compare Y position (with threshold for same line)
477            let y_diff = (b.y - a.y).abs();
478            if y_diff < self.options.newline_threshold {
479                // Same line, sort by X position
480                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
481            } else {
482                // Different lines, sort by Y (inverted because PDF Y increases upward)
483                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
484            }
485        });
486
487        // Detect columns if requested
488        if self.options.detect_columns {
489            self.detect_and_sort_columns(fragments);
490        }
491    }
492
493    /// Detect columns and re-sort fragments accordingly
494    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
495        // Group fragments by approximate Y position
496        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
497        let mut current_line: Vec<&mut TextFragment> = Vec::new();
498        let mut last_y = f64::INFINITY;
499
500        for fragment in fragments.iter_mut() {
501            let fragment_y = fragment.y;
502            if (last_y - fragment_y).abs() > self.options.newline_threshold
503                && !current_line.is_empty()
504            {
505                lines.push(current_line);
506                current_line = Vec::new();
507            }
508            current_line.push(fragment);
509            last_y = fragment_y;
510        }
511        if !current_line.is_empty() {
512            lines.push(current_line);
513        }
514
515        // Detect column boundaries
516        let mut column_boundaries = vec![0.0];
517        for line in &lines {
518            if line.len() > 1 {
519                for i in 0..line.len() - 1 {
520                    let gap = line[i + 1].x - (line[i].x + line[i].width);
521                    if gap > self.options.column_threshold {
522                        let boundary = line[i].x + line[i].width + gap / 2.0;
523                        if !column_boundaries
524                            .iter()
525                            .any(|&b| (b - boundary).abs() < 10.0)
526                        {
527                            column_boundaries.push(boundary);
528                        }
529                    }
530                }
531            }
532        }
533        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
534
535        // Re-sort fragments by column then Y position
536        if column_boundaries.len() > 1 {
537            fragments.sort_by(|a, b| {
538                // Determine column for each fragment
539                let col_a = column_boundaries
540                    .iter()
541                    .position(|&boundary| a.x < boundary)
542                    .unwrap_or(column_boundaries.len())
543                    - 1;
544                let col_b = column_boundaries
545                    .iter()
546                    .position(|&boundary| b.x < boundary)
547                    .unwrap_or(column_boundaries.len())
548                    - 1;
549
550                if col_a != col_b {
551                    col_a.cmp(&col_b)
552                } else {
553                    // Same column, sort by Y position
554                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
555                }
556            });
557        }
558    }
559
560    /// Reconstruct text from sorted fragments
561    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
562        let mut result = String::new();
563        let mut last_y = f64::INFINITY;
564        let mut last_x = 0.0;
565        let mut last_line_ended_with_hyphen = false;
566
567        for fragment in fragments {
568            // Check if we need a newline
569            let y_diff = (last_y - fragment.y).abs();
570            if !result.is_empty() && y_diff > self.options.newline_threshold {
571                // Handle hyphenation
572                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
573                    // Remove the hyphen and don't add newline
574                    if result.ends_with('-') {
575                        result.pop();
576                    }
577                } else {
578                    result.push('\n');
579                }
580            } else if !result.is_empty() {
581                // Check if we need a space
582                let x_gap = fragment.x - last_x;
583                if x_gap > self.options.space_threshold * fragment.font_size {
584                    result.push(' ');
585                }
586            }
587
588            result.push_str(&fragment.text);
589            last_line_ended_with_hyphen = fragment.text.ends_with('-');
590            last_y = fragment.y;
591            last_x = fragment.x + fragment.width;
592        }
593
594        result
595    }
596
597    /// Extract font resources from page
598    fn extract_font_resources<R: Read + Seek>(
599        &mut self,
600        page: &ParsedPage,
601        document: &PdfDocument<R>,
602    ) -> ParseResult<()> {
603        // Clear previous font cache
604        self.font_cache.clear();
605
606        // Get page resources
607        if let Some(resources) = page.get_resources() {
608            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
609                // Extract each font
610                for (font_name, font_obj) in font_dict.0.iter() {
611                    if let Some(font_ref) = font_obj.as_reference() {
612                        if let Ok(PdfObject::Dictionary(font_dict)) =
613                            document.get_object(font_ref.0, font_ref.1)
614                        {
615                            // Create a CMap extractor to use its font extraction logic
616                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
617
618                            if let Ok(font_info) =
619                                cmap_extractor.extract_font_info(&font_dict, document)
620                            {
621                                self.font_cache.insert(font_name.0.clone(), font_info);
622                                tracing::debug!(
623                                    "Cached font: {} -> {:?}",
624                                    font_name.0,
625                                    self.font_cache.get(&font_name.0)
626                                );
627                            }
628                        }
629                    }
630                }
631            }
632        }
633
634        Ok(())
635    }
636
637    /// Decode text using the current font encoding and ToUnicode mapping
638    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
639        use crate::text::encoding::TextEncoding;
640
641        // First, try to use cached font information with ToUnicode CMap
642        if let Some(ref font_name) = state.font_name {
643            if let Some(font_info) = self.font_cache.get(font_name) {
644                // Create a temporary CMapTextExtractor to use its decoding logic
645                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
646
647                // Try CMap-based decoding first
648                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
649                    tracing::debug!(
650                        "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
651                        font_name,
652                        text,
653                        decoded
654                    );
655                    return Ok(decoded);
656                }
657
658                tracing::debug!(
659                    "CMap decoding failed for font {}, falling back to encoding",
660                    font_name
661                );
662            }
663        }
664
665        // Fall back to encoding-based decoding
666        let encoding = if let Some(ref font_name) = state.font_name {
667            match font_name.to_lowercase().as_str() {
668                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
669                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
670                name if name.contains("standard") => TextEncoding::StandardEncoding,
671                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
672                _ => {
673                    // Default based on common patterns
674                    if font_name.starts_with("Times")
675                        || font_name.starts_with("Helvetica")
676                        || font_name.starts_with("Courier")
677                    {
678                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
679                    } else {
680                        TextEncoding::PdfDocEncoding // Safe default
681                    }
682                }
683            }
684        } else {
685            TextEncoding::WinAnsiEncoding // Default for most PDFs
686        };
687
688        let fallback_result = encoding.decode(text);
689        tracing::debug!(
690            "Fallback encoding decoding: {:?} -> \"{}\"",
691            text,
692            fallback_result
693        );
694        Ok(fallback_result)
695    }
696}
697
698impl Default for TextExtractor {
699    fn default() -> Self {
700        Self::new()
701    }
702}
703
/// Concatenate two affine matrices given as `[a, b, c, d, e, f]`.
///
/// The result applies `a` first and `b` second when used with
/// `transform_point` (row-vector convention).
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part (2x2 product).
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;
    // Translation of `a` pushed through `b`, plus `b`'s own translation.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
715
/// Map the point `(x, y)` through the affine matrix `[a, b, c, d, e, f]`,
/// returning `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
722
723/// Calculate text width using actual font metrics (including kerning)
724fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
725    // If we have font metrics, use them for accurate width calculation
726    if let Some(font) = font_info {
727        if let Some(ref widths) = font.metrics.widths {
728            let first_char = font.metrics.first_char.unwrap_or(0);
729            let last_char = font.metrics.last_char.unwrap_or(255);
730            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
731
732            let mut total_width = 0.0;
733            let chars: Vec<char> = text.chars().collect();
734
735            for (i, &ch) in chars.iter().enumerate() {
736                let char_code = ch as u32;
737
738                // Get width from Widths array or use missing_width
739                let width = if char_code >= first_char && char_code <= last_char {
740                    let index = (char_code - first_char) as usize;
741                    widths.get(index).copied().unwrap_or(missing_width)
742                } else {
743                    missing_width
744                };
745
746                // Convert from glyph space (1/1000 units) to user space
747                total_width += width / 1000.0 * font_size;
748
749                // Apply kerning if available (for character pairs)
750                if let Some(ref kerning) = font.metrics.kerning {
751                    if i + 1 < chars.len() {
752                        let next_char = chars[i + 1] as u32;
753                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
754                            // Kerning is in FUnits (1/1000), convert to user space
755                            total_width += kern_value / 1000.0 * font_size;
756                        }
757                    }
758                }
759            }
760
761            return total_width;
762        }
763    }
764
765    // Fallback to simplified calculation if no metrics available
766    text.len() as f64 * font_size * 0.5
767}
768
#[cfg(test)]
mod tests {
    //! Unit tests for matrix helpers, option/state defaults, font-style
    //! parsing, metrics-based text width calculation and kern table parsing.

    use super::*;
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    /// Builds a Type1 `FontInfo` in which only the name and metrics are populated.
    fn type1_font(name: &str, metrics: FontMetrics) -> FontInfo {
        FontInfo {
            name: name.to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics,
        }
    }

    /// Builds `FontMetrics` with a widths array for `first_char..=last_char` and no kerning.
    fn simple_metrics(
        first_char: u32,
        last_char: u32,
        widths: Vec<f64>,
        missing_width: f64,
    ) -> FontMetrics {
        FontMetrics {
            first_char: Some(first_char),
            last_char: Some(last_char),
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        }
    }

    #[test]
    fn test_matrix_multiplication() {
        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        // Multiplying by the identity on either side must leave the matrix unchanged
        let result = multiply_matrix(&identity, &translation);
        assert_eq!(result, translation);

        let result2 = multiply_matrix(&translation, &identity);
        assert_eq!(result2, translation);
    }

    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        let options = ExtractionOptions::default();
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    #[test]
    fn test_extraction_options_custom() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };
        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }

    #[test]
    fn test_parse_font_style_bold() {
        // PostScript style
        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));

        // TrueType style
        assert_eq!(parse_font_style("Arial Bold"), (true, false));
        assert_eq!(parse_font_style("Calibri Bold"), (true, false));

        // Short form
        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
    }

    #[test]
    fn test_parse_font_style_italic() {
        // PostScript style
        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
        assert_eq!(parse_font_style("Times-Oblique"), (false, true));

        // TrueType style
        assert_eq!(parse_font_style("Arial Italic"), (false, true));
        assert_eq!(parse_font_style("Courier Oblique"), (false, true));

        // Short form
        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
    }

    #[test]
    fn test_parse_font_style_bold_italic() {
        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
    }

    #[test]
    fn test_parse_font_style_regular() {
        assert_eq!(parse_font_style("Helvetica"), (false, false));
        assert_eq!(parse_font_style("Times-Roman"), (false, false));
        assert_eq!(parse_font_style("Courier"), (false, false));
        assert_eq!(parse_font_style("Arial"), (false, false));
    }

    #[test]
    fn test_parse_font_style_edge_cases() {
        // Empty and unusual cases
        assert_eq!(parse_font_style(""), (false, false));
        assert_eq!(parse_font_style("UnknownFont"), (false, false));

        // Case insensitive
        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
    }

    #[test]
    fn test_text_fragment() {
        let fragment = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
        };
        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let fragments = vec![
            TextFragment {
                text: "Hello".to_string(),
                x: 100.0,
                y: 200.0,
                width: 50.0,
                height: 12.0,
                font_size: 10.0,
                font_name: None,
                is_bold: false,
                is_italic: false,
            },
            TextFragment {
                text: "World".to_string(),
                x: 160.0,
                y: 200.0,
                width: 50.0,
                height: 12.0,
                font_size: 10.0,
                font_name: None,
                is_bold: false,
                is_italic: false,
            },
        ];

        // `fragments` is not used afterwards, so it is moved rather than cloned
        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments,
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }

    #[test]
    fn test_text_state_default() {
        let state = TextState::default();
        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // Test rotation matrix
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
        let (x, y) = transform_point(1.0, 0.0, &rotation);
        assert_eq!(x, 0.0);
        assert_eq!(y, 1.0);

        // Test scaling matrix
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        let (x, y) = transform_point(5.0, 5.0, &scale);
        assert_eq!(x, 10.0);
        assert_eq!(y, 15.0);

        // Test complex transformation
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        let (x, y) = transform_point(1.0, 1.0, &complex);
        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
    }

    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let options = extractor.options;
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };
        let extractor = TextExtractor::with_options(options.clone());
        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
        assert_eq!(extractor.options.space_threshold, options.space_threshold);
        assert_eq!(
            extractor.options.newline_threshold,
            options.newline_threshold
        );
        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
        assert_eq!(extractor.options.detect_columns, options.detect_columns);
        assert_eq!(extractor.options.column_threshold, options.column_threshold);
        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
    }

    // =========================================================================
    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
    // =========================================================================

    #[test]
    fn test_calculate_text_width_with_no_font_info() {
        // Test fallback: should use simplified calculation
        let width = calculate_text_width("Hello", 12.0, None);

        // Expected: 5 chars * 12.0 * 0.5 = 30.0
        assert_eq!(
            width, 30.0,
            "Without font info, should use simplified calculation: len * font_size * 0.5"
        );
    }

    #[test]
    fn test_calculate_text_width_with_empty_metrics() {
        // Font with no widths array
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: None,
                last_char: None,
                widths: None,
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        // Should fall back to simplified calculation
        assert_eq!(
            width, 30.0,
            "Without widths array, should fall back to simplified calculation"
        );
    }

    #[test]
    fn test_calculate_text_width_with_complete_metrics() {
        // Font with complete metrics for ASCII range 32-126
        // Simulate typical Helvetica widths (in 1/1000 units)
        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126

        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
        widths[72 - 32] = 722.0; // 'H' is ASCII 72
        widths[101 - 32] = 556.0; // 'e' is ASCII 101
        widths[108 - 32] = 278.0; // 'l' is ASCII 108
        widths[111 - 32] = 611.0; // 'o' is ASCII 111

        let font_info = type1_font("Helvetica", simple_metrics(32, 126, widths, 500.0));

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        // Expected calculation (widths in glyph space / 1000 * font_size):
        // H: 722/1000 * 12 = 8.664
        // e: 556/1000 * 12 = 6.672
        // l: 278/1000 * 12 = 3.336
        // l: 278/1000 * 12 = 3.336
        // o: 611/1000 * 12 = 7.332
        // Total: 29.34
        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
        let tolerance = 0.0001; // Floating point tolerance
        assert!(
            (width - expected).abs() < tolerance,
            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
            expected,
            width,
            (width - expected).abs()
        );

        // Verify it's different from simplified calculation
        let simplified = 5.0 * 12.0 * 0.5; // 30.0
        assert_ne!(
            width, simplified,
            "Metrics-based calculation should differ from simplified (30.0)"
        );
    }

    #[test]
    fn test_calculate_text_width_character_outside_range() {
        // Font with narrow range (only covers 'A'-'Z'); all uppercase letters same width
        let font_info = type1_font("TestFont", simple_metrics(65, 90, vec![722.0; 26], 500.0));

        // Test with character outside range
        let width = calculate_text_width("A1", 10.0, Some(&font_info));

        // Expected:
        // 'A' (65) is in range: 722/1000 * 10 = 7.22
        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
        // Total: 12.22
        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
        assert_eq!(
            width, expected,
            "Should use missing_width for characters outside range"
        );
    }

    #[test]
    fn test_calculate_text_width_missing_width_in_array() {
        // Font with incomplete widths array (some characters have 0.0)
        let mut widths = vec![500.0; 95]; // Default width
        widths[10] = 0.0; // Character at index 10 has no width defined

        let font_info = type1_font("TestFont", simple_metrics(32, 126, widths, 600.0));

        // Character 42 (index 10 from first_char 32)
        let char_code = 42u8 as char; // '*'
        let text = char_code.to_string();
        let width = calculate_text_width(&text, 10.0, Some(&font_info));

        // Character is in range but width is 0.0, should NOT fall back to missing_width
        // (0.0 is a valid width for zero-width characters)
        assert_eq!(
            width, 0.0,
            "Should use 0.0 width from array, not missing_width"
        );
    }

    #[test]
    fn test_calculate_text_width_empty_string() {
        let font_info = type1_font("TestFont", simple_metrics(32, 126, vec![500.0; 95], 500.0));

        let width = calculate_text_width("", 12.0, Some(&font_info));
        assert_eq!(width, 0.0, "Empty string should have zero width");

        // Also test without font info
        let width_no_font = calculate_text_width("", 12.0, None);
        assert_eq!(
            width_no_font, 0.0,
            "Empty string should have zero width (no font)"
        );
    }

    #[test]
    fn test_calculate_text_width_unicode_characters() {
        // Font with limited ASCII range
        let font_info = type1_font("TestFont", simple_metrics(32, 126, vec![500.0; 95], 600.0));

        // Test with Unicode characters outside ASCII range
        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
        // Expected: 600/1000 * 10 = 6.0
        assert_eq!(
            width, 6.0,
            "Unicode character outside range should use missing_width"
        );
    }

    #[test]
    fn test_calculate_text_width_different_font_sizes() {
        // Widths array covering only 'A' (65)
        let font_info = type1_font("TestFont", simple_metrics(65, 65, vec![722.0], 500.0));

        // Test same character with different font sizes
        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

        // Widths should scale linearly with font size
        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
        assert_eq!(
            width_20,
            width_10 * 2.0,
            "Width should scale linearly with font size"
        );
    }

    #[test]
    fn test_calculate_text_width_proportional_vs_monospace() {
        // Simulate proportional font (different widths).
        // The three widths map to codes 105-107 ('i', 'j', 'k'); only 'i' is measured.
        let proportional_font = type1_font(
            "Helvetica",
            simple_metrics(105, 107, vec![278.0, 556.0, 722.0], 500.0),
        );

        // Simulate monospace font (same width)
        let monospace_font = type1_font(
            "Courier",
            simple_metrics(105, 107, vec![600.0, 600.0, 600.0], 600.0),
        );

        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

        // Proportional 'i' should be narrower than monospace 'i'
        assert!(
            prop_width < mono_width,
            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
            prop_width,
            mono_width
        );
    }

    // =========================================================================
    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
    // =========================================================================

    #[test]
    fn test_calculate_text_width_with_kerning() {
        use std::collections::HashMap;

        // Create a font with kerning pairs
        let mut widths = vec![500.0; 95]; // ASCII 32-126
        widths[65 - 32] = 722.0; // 'A'
        widths[86 - 32] = 722.0; // 'V'
        widths[87 - 32] = 944.0; // 'W'

        let mut kerning = HashMap::new();
        // Typical kerning pairs (in FUnits, 1/1000)
        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits

        let font_info = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: Some(kerning),
            },
        );

        // Test "AV" with kerning
        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
        //         = 17.328 - 0.6 = 16.728
        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
        let tolerance = 0.0001;
        assert!(
            (width_av - expected_av).abs() < tolerance,
            "AV with kerning: expected {}, got {}, diff {}",
            expected_av,
            width_av,
            (width_av - expected_av).abs()
        );

        // Test "AW" with different kerning value
        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
        //         = 19.992 - 0.48 = 19.512
        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
        assert!(
            (width_aw - expected_aw).abs() < tolerance,
            "AW with kerning: expected {}, got {}, diff {}",
            expected_aw,
            width_aw,
            (width_aw - expected_aw).abs()
        );

        // Test "VA" with NO kerning (pair not in HashMap)
        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
        assert!(
            (width_va - expected_va).abs() < tolerance,
            "VA without kerning: expected {}, got {}, diff {}",
            expected_va,
            width_va,
            (width_va - expected_va).abs()
        );

        // Verify kerning makes a measurable difference
        assert!(
            width_av < width_va,
            "AV with kerning ({}) should be narrower than VA without kerning ({})",
            width_av,
            width_va
        );
    }

    #[test]
    fn test_parse_truetype_kern_table_minimal() {
        use crate::text::extraction_cmap::parse_truetype_kern_table;

        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
        // Structure:
        // 1. Offset table (12 bytes)
        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
        // 3. 'head' table data (54 bytes)
        // 4. 'kern' table data (30 bytes)
        // Total: 128 bytes
        let mut ttf_data = vec![
            // Offset table
            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
            0x00, 0x02, // numTables: 2
            0x00, 0x20, // searchRange: 32
            0x00, 0x01, // entrySelector: 1
            0x00, 0x00, // rangeShift: 0
        ];

        // Table directory entry 1: 'head' table
        ttf_data.extend_from_slice(b"head"); // tag
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54

        // Table directory entry 2: 'kern' table
        ttf_data.extend_from_slice(b"kern"); // tag
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)

        // 'head' table data (54 bytes of zeros - minimal valid head table)
        ttf_data.extend_from_slice(&[0u8; 54]);

        // 'kern' table data (30 bytes: table header 4 + subtable 26)
        ttf_data.extend_from_slice(&[
            // Kern table header
            0x00, 0x00, // version: 0
            0x00, 0x01, // nTables: 1
            // Subtable header
            0x00, 0x00, // version: 0
            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
            0x00, 0x02, // nPairs: 2
            0x00, 0x08, // searchRange: 8
            0x00, 0x00, // entrySelector: 0
            0x00, 0x04, // rangeShift: 4
            // Kerning pair 1: A + V → -50
            0x00, 0x41, // left glyph: 65 ('A')
            0x00, 0x56, // right glyph: 86 ('V')
            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
            // Kerning pair 2: A + W → -40
            0x00, 0x41, // left glyph: 65 ('A')
            0x00, 0x57, // right glyph: 87 ('W')
            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
        ]);

        let result = parse_truetype_kern_table(&ttf_data);
        assert!(
            result.is_ok(),
            "Should parse minimal kern table successfully: {:?}",
            result.err()
        );

        let kerning_map = result.unwrap();
        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

        // Verify pair 1: A + V → -50
        assert_eq!(
            kerning_map.get(&(65, 86)),
            Some(&-50.0),
            "Should have A+V kerning pair with value -50"
        );

        // Verify pair 2: A + W → -40
        assert_eq!(
            kerning_map.get(&(65, 87)),
            Some(&-40.0),
            "Should have A+W kerning pair with value -40"
        );
    }

    #[test]
    fn test_parse_kern_table_no_kern_table() {
        use crate::text::extraction_cmap::extract_truetype_kerning;

        // TrueType font data WITHOUT a 'kern' table
        // Structure:
        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
        // - Table directory: 1 entry for 'head' table (not 'kern')
        let mut ttf_data = vec![
            // Offset table
            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
            0x00, 0x01, // numTables: 1
            0x00, 0x10, // searchRange: 16
            0x00, 0x00, // entrySelector: 0
            0x00, 0x00, // rangeShift: 0
            // Table directory entry: 'head' table (not 'kern')
            b'h', b'e', b'a', b'd', // tag: 'head'
            0x00, 0x00, 0x00, 0x00, // checksum
            0x00, 0x00, 0x00, 0x1C, // offset: 28
            0x00, 0x00, 0x00, 0x36, // length: 54
        ];

        // Mock 'head' table data (54 bytes of zeros)
        ttf_data.extend_from_slice(&[0u8; 54]);

        let result = extract_truetype_kerning(&ttf_data);
        assert!(
            result.is_ok(),
            "Should gracefully handle missing kern table"
        );

        let kerning_map = result.unwrap();
        assert!(
            kerning_map.is_empty(),
            "Should return empty HashMap when no kern table exists"
        );
    }
}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs