oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Options controlling how text is pulled out of a page's content streams.
///
/// Implements `PartialEq` so whole option sets can be compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractionOptions {
    /// Preserve the original layout: positioned fragments are collected and
    /// the final text is rebuilt from them.
    pub preserve_layout: bool,
    /// Minimum horizontal gap that inserts a space character, expressed as a
    /// fraction of the current font size (the extractor compares the gap
    /// against `space_threshold * font_size`).
    pub space_threshold: f64,
    /// Minimum vertical distance between two pieces of text that inserts a
    /// newline.
    pub newline_threshold: f64,
    /// Sort text fragments by position (useful for multi-column layouts)
    pub sort_by_position: bool,
    /// Detect columns and order fragments column by column
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on one line that
    /// is treated as a column separation.
    pub column_threshold: f64,
    /// Merge hyphenated words at line ends
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction: no layout preservation, position sorting on,
    /// column detection off, hyphen merging on.
    fn default() -> Self {
        Self {
            preserve_layout: false,
            space_threshold: 0.2,
            newline_threshold: 10.0,
            sort_by_position: true,
            detect_columns: false,
            column_threshold: 50.0,
            merge_hyphenated: true,
        }
    }
}
47
/// Text extracted from a single page, optionally with positioned fragments.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedText {
    /// The extracted text content
    pub text: String,
    /// Text fragments with position information (populated only when
    /// `preserve_layout` is enabled; empty otherwise)
    pub fragments: Vec<TextFragment>,
}

/// A run of text together with where it was shown on the page.
#[derive(Debug, Clone, PartialEq)]
pub struct TextFragment {
    /// Text content
    pub text: String,
    /// X position of the fragment origin, taken from the text matrix
    pub x: f64,
    /// Y position of the fragment origin, taken from the text matrix
    pub y: f64,
    /// Estimated width of the text
    pub width: f64,
    /// Height of the text (the extractor stores the font size here)
    pub height: f64,
    /// Font size in effect when the text was shown
    pub font_size: f64,
}
73
/// Mutable text state tracked while walking a page's content operations.
struct TextState {
    /// Text matrix (position of the next glyph)
    text_matrix: [f64; 6],
    /// Text line matrix (start of the current line)
    text_line_matrix: [f64; 6],
    /// Current transformation matrix (CTM); initialised here but not yet
    /// consulted by the extraction loop, hence the lint allowance.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Leading: vertical distance used when moving to the next line
    leading: f64,
    /// Extra spacing applied per character
    char_space: f64,
    /// Extra spacing applied per word
    word_space: f64,
    /// Horizontal scaling as a percentage (100 = unscaled)
    horizontal_scale: f64,
    /// Text rise
    text_rise: f64,
    /// Size of the currently selected font
    font_size: f64,
    /// Resource name of the currently selected font, if any
    font_name: Option<String>,
    /// Render mode (0 = fill, 1 = stroke, etc.)
    render_mode: u8,
}

impl Default for TextState {
    /// Identity matrices, 100% horizontal scaling, no font selected and every
    /// other parameter zeroed.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        Self {
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            ctm: IDENTITY,
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            horizontal_scale: 100.0,
            text_rise: 0.0,
            font_size: 0.0,
            font_name: None,
            render_mode: 0,
        }
    }
}
118
119/// Text extractor for PDF pages with CMap support
120pub struct TextExtractor {
121    options: ExtractionOptions,
122    /// Font cache for the current extraction
123    font_cache: HashMap<String, FontInfo>,
124}
125
126impl TextExtractor {
127    /// Create a new text extractor with default options
128    pub fn new() -> Self {
129        Self {
130            options: ExtractionOptions::default(),
131            font_cache: HashMap::new(),
132        }
133    }
134
135    /// Create a text extractor with custom options
136    pub fn with_options(options: ExtractionOptions) -> Self {
137        Self {
138            options,
139            font_cache: HashMap::new(),
140        }
141    }
142
143    /// Extract text from a PDF document
144    pub fn extract_from_document<R: Read + Seek>(
145        &mut self,
146        document: &PdfDocument<R>,
147    ) -> ParseResult<Vec<ExtractedText>> {
148        let page_count = document.page_count()?;
149        let mut results = Vec::new();
150
151        for i in 0..page_count {
152            let text = self.extract_from_page(document, i)?;
153            results.push(text);
154        }
155
156        Ok(results)
157    }
158
159    /// Extract text from a specific page
160    pub fn extract_from_page<R: Read + Seek>(
161        &mut self,
162        document: &PdfDocument<R>,
163        page_index: u32,
164    ) -> ParseResult<ExtractedText> {
165        // Get the page
166        let page = document.get_page(page_index)?;
167
168        // Extract font resources first
169        self.extract_font_resources(&page, document)?;
170
171        // Get content streams
172        let streams = page.content_streams_with_document(document)?;
173
174        let mut extracted_text = String::new();
175        let mut fragments = Vec::new();
176        let mut state = TextState::default();
177        let mut in_text_object = false;
178        let mut last_x = 0.0;
179        let mut last_y = 0.0;
180
181        // Process each content stream
182        for stream_data in streams {
183            let operations = match ContentParser::parse_content(&stream_data) {
184                Ok(ops) => ops,
185                Err(e) => {
186                    // Log the error but continue processing other streams
187                    eprintln!("Warning: Failed to parse content stream, skipping: {}", e);
188                    continue;
189                }
190            };
191
192            for op in operations {
193                match op {
194                    ContentOperation::BeginText => {
195                        in_text_object = true;
196                        // Reset text matrix to identity
197                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
198                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
199                    }
200
201                    ContentOperation::EndText => {
202                        in_text_object = false;
203                    }
204
205                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
206                        state.text_matrix =
207                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
208                        state.text_line_matrix =
209                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
210                    }
211
212                    ContentOperation::MoveText(tx, ty) => {
213                        // Update text matrix by translation
214                        let new_matrix = multiply_matrix(
215                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
216                            &state.text_line_matrix,
217                        );
218                        state.text_matrix = new_matrix;
219                        state.text_line_matrix = new_matrix;
220                    }
221
222                    ContentOperation::NextLine => {
223                        // Move to next line using current leading
224                        let new_matrix = multiply_matrix(
225                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
226                            &state.text_line_matrix,
227                        );
228                        state.text_matrix = new_matrix;
229                        state.text_line_matrix = new_matrix;
230                    }
231
232                    ContentOperation::ShowText(text) => {
233                        if in_text_object {
234                            let text_bytes = &text;
235                            let decoded = self.decode_text(text_bytes, &state)?;
236
237                            // Calculate position
238                            let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
239
240                            // Add spacing based on position change
241                            if !extracted_text.is_empty() {
242                                let dx = x - last_x;
243                                let dy = (y - last_y).abs();
244
245                                if dy > self.options.newline_threshold {
246                                    extracted_text.push('\n');
247                                } else if dx > self.options.space_threshold * state.font_size {
248                                    extracted_text.push(' ');
249                                }
250                            }
251
252                            extracted_text.push_str(&decoded);
253
254                            // Get font info for accurate width calculation
255                            let font_info = state
256                                .font_name
257                                .as_ref()
258                                .and_then(|name| self.font_cache.get(name));
259
260                            if self.options.preserve_layout {
261                                fragments.push(TextFragment {
262                                    text: decoded.clone(),
263                                    x,
264                                    y,
265                                    width: calculate_text_width(
266                                        &decoded,
267                                        state.font_size,
268                                        font_info,
269                                    ),
270                                    height: state.font_size,
271                                    font_size: state.font_size,
272                                });
273                            }
274
275                            // Update position for next text
276                            last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
277                            last_y = y;
278
279                            // Update text matrix for next show operation
280                            let text_width =
281                                calculate_text_width(&decoded, state.font_size, font_info);
282                            let tx = text_width * state.horizontal_scale / 100.0;
283                            state.text_matrix =
284                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
285                        }
286                    }
287
288                    ContentOperation::ShowTextArray(array) => {
289                        if in_text_object {
290                            // Get font info for accurate width calculation
291                            let font_info = state
292                                .font_name
293                                .as_ref()
294                                .and_then(|name| self.font_cache.get(name));
295
296                            for item in array {
297                                match item {
298                                    TextElement::Text(text_bytes) => {
299                                        let decoded = self.decode_text(&text_bytes, &state)?;
300                                        extracted_text.push_str(&decoded);
301
302                                        // Update text matrix
303                                        let text_width = calculate_text_width(
304                                            &decoded,
305                                            state.font_size,
306                                            font_info,
307                                        );
308                                        let tx = text_width * state.horizontal_scale / 100.0;
309                                        state.text_matrix = multiply_matrix(
310                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
311                                            &state.text_matrix,
312                                        );
313                                    }
314                                    TextElement::Spacing(adjustment) => {
315                                        // Text position adjustment (negative = move left)
316                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
317                                        state.text_matrix = multiply_matrix(
318                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
319                                            &state.text_matrix,
320                                        );
321                                    }
322                                }
323                            }
324                        }
325                    }
326
327                    ContentOperation::SetFont(name, size) => {
328                        state.font_name = Some(name);
329                        state.font_size = size as f64;
330                    }
331
332                    ContentOperation::SetLeading(leading) => {
333                        state.leading = leading as f64;
334                    }
335
336                    ContentOperation::SetCharSpacing(spacing) => {
337                        state.char_space = spacing as f64;
338                    }
339
340                    ContentOperation::SetWordSpacing(spacing) => {
341                        state.word_space = spacing as f64;
342                    }
343
344                    ContentOperation::SetHorizontalScaling(scale) => {
345                        state.horizontal_scale = scale as f64;
346                    }
347
348                    ContentOperation::SetTextRise(rise) => {
349                        state.text_rise = rise as f64;
350                    }
351
352                    ContentOperation::SetTextRenderMode(mode) => {
353                        state.render_mode = mode as u8;
354                    }
355
356                    _ => {
357                        // Other operations don't affect text extraction
358                    }
359                }
360            }
361        }
362
363        // Sort and process fragments if requested
364        if self.options.sort_by_position && !fragments.is_empty() {
365            self.sort_and_merge_fragments(&mut fragments);
366        }
367
368        // Reconstruct text from sorted fragments if layout is preserved
369        if self.options.preserve_layout && !fragments.is_empty() {
370            extracted_text = self.reconstruct_text_from_fragments(&fragments);
371        }
372
373        Ok(ExtractedText {
374            text: extracted_text,
375            fragments,
376        })
377    }
378
379    /// Sort text fragments by position and merge them appropriately
380    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
381        // Sort fragments by Y position (top to bottom) then X position (left to right)
382        fragments.sort_by(|a, b| {
383            // First compare Y position (with threshold for same line)
384            let y_diff = (b.y - a.y).abs();
385            if y_diff < self.options.newline_threshold {
386                // Same line, sort by X position
387                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
388            } else {
389                // Different lines, sort by Y (inverted because PDF Y increases upward)
390                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
391            }
392        });
393
394        // Detect columns if requested
395        if self.options.detect_columns {
396            self.detect_and_sort_columns(fragments);
397        }
398    }
399
400    /// Detect columns and re-sort fragments accordingly
401    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
402        // Group fragments by approximate Y position
403        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
404        let mut current_line: Vec<&mut TextFragment> = Vec::new();
405        let mut last_y = f64::INFINITY;
406
407        for fragment in fragments.iter_mut() {
408            let fragment_y = fragment.y;
409            if (last_y - fragment_y).abs() > self.options.newline_threshold
410                && !current_line.is_empty()
411            {
412                lines.push(current_line);
413                current_line = Vec::new();
414            }
415            current_line.push(fragment);
416            last_y = fragment_y;
417        }
418        if !current_line.is_empty() {
419            lines.push(current_line);
420        }
421
422        // Detect column boundaries
423        let mut column_boundaries = vec![0.0];
424        for line in &lines {
425            if line.len() > 1 {
426                for i in 0..line.len() - 1 {
427                    let gap = line[i + 1].x - (line[i].x + line[i].width);
428                    if gap > self.options.column_threshold {
429                        let boundary = line[i].x + line[i].width + gap / 2.0;
430                        if !column_boundaries
431                            .iter()
432                            .any(|&b| (b - boundary).abs() < 10.0)
433                        {
434                            column_boundaries.push(boundary);
435                        }
436                    }
437                }
438            }
439        }
440        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
441
442        // Re-sort fragments by column then Y position
443        if column_boundaries.len() > 1 {
444            fragments.sort_by(|a, b| {
445                // Determine column for each fragment
446                let col_a = column_boundaries
447                    .iter()
448                    .position(|&boundary| a.x < boundary)
449                    .unwrap_or(column_boundaries.len())
450                    - 1;
451                let col_b = column_boundaries
452                    .iter()
453                    .position(|&boundary| b.x < boundary)
454                    .unwrap_or(column_boundaries.len())
455                    - 1;
456
457                if col_a != col_b {
458                    col_a.cmp(&col_b)
459                } else {
460                    // Same column, sort by Y position
461                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
462                }
463            });
464        }
465    }
466
467    /// Reconstruct text from sorted fragments
468    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
469        let mut result = String::new();
470        let mut last_y = f64::INFINITY;
471        let mut last_x = 0.0;
472        let mut last_line_ended_with_hyphen = false;
473
474        for fragment in fragments {
475            // Check if we need a newline
476            let y_diff = (last_y - fragment.y).abs();
477            if !result.is_empty() && y_diff > self.options.newline_threshold {
478                // Handle hyphenation
479                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
480                    // Remove the hyphen and don't add newline
481                    if result.ends_with('-') {
482                        result.pop();
483                    }
484                } else {
485                    result.push('\n');
486                }
487            } else if !result.is_empty() {
488                // Check if we need a space
489                let x_gap = fragment.x - last_x;
490                if x_gap > self.options.space_threshold * fragment.font_size {
491                    result.push(' ');
492                }
493            }
494
495            result.push_str(&fragment.text);
496            last_line_ended_with_hyphen = fragment.text.ends_with('-');
497            last_y = fragment.y;
498            last_x = fragment.x + fragment.width;
499        }
500
501        result
502    }
503
504    /// Extract font resources from page
505    fn extract_font_resources<R: Read + Seek>(
506        &mut self,
507        page: &ParsedPage,
508        document: &PdfDocument<R>,
509    ) -> ParseResult<()> {
510        // Clear previous font cache
511        self.font_cache.clear();
512
513        // Get page resources
514        if let Some(resources) = page.get_resources() {
515            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
516                // Extract each font
517                for (font_name, font_obj) in font_dict.0.iter() {
518                    if let Some(font_ref) = font_obj.as_reference() {
519                        if let Ok(PdfObject::Dictionary(font_dict)) =
520                            document.get_object(font_ref.0, font_ref.1)
521                        {
522                            // Create a CMap extractor to use its font extraction logic
523                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
524
525                            if let Ok(font_info) =
526                                cmap_extractor.extract_font_info(&font_dict, document)
527                            {
528                                self.font_cache.insert(font_name.0.clone(), font_info);
529                                tracing::debug!(
530                                    "Cached font: {} -> {:?}",
531                                    font_name.0,
532                                    self.font_cache.get(&font_name.0)
533                                );
534                            }
535                        }
536                    }
537                }
538            }
539        }
540
541        Ok(())
542    }
543
544    /// Decode text using the current font encoding and ToUnicode mapping
545    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
546        use crate::text::encoding::TextEncoding;
547
548        // First, try to use cached font information with ToUnicode CMap
549        if let Some(ref font_name) = state.font_name {
550            if let Some(font_info) = self.font_cache.get(font_name) {
551                // Create a temporary CMapTextExtractor to use its decoding logic
552                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
553
554                // Try CMap-based decoding first
555                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
556                    tracing::debug!(
557                        "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
558                        font_name,
559                        text,
560                        decoded
561                    );
562                    return Ok(decoded);
563                }
564
565                tracing::debug!(
566                    "CMap decoding failed for font {}, falling back to encoding",
567                    font_name
568                );
569            }
570        }
571
572        // Fall back to encoding-based decoding
573        let encoding = if let Some(ref font_name) = state.font_name {
574            match font_name.to_lowercase().as_str() {
575                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
576                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
577                name if name.contains("standard") => TextEncoding::StandardEncoding,
578                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
579                _ => {
580                    // Default based on common patterns
581                    if font_name.starts_with("Times")
582                        || font_name.starts_with("Helvetica")
583                        || font_name.starts_with("Courier")
584                    {
585                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
586                    } else {
587                        TextEncoding::PdfDocEncoding // Safe default
588                    }
589                }
590            }
591        } else {
592            TextEncoding::WinAnsiEncoding // Default for most PDFs
593        };
594
595        let fallback_result = encoding.decode(text);
596        tracing::debug!(
597            "Fallback encoding decoding: {:?} -> \"{}\"",
598            text,
599            fallback_result
600        );
601        Ok(fallback_result)
602    }
603}
604
605impl Default for TextExtractor {
606    fn default() -> Self {
607        Self::new()
608    }
609}
610
/// Computes the product `a × b` of two affine transforms.
///
/// Each matrix is the six-element form `[m11, m12, m21, m22, dx, dy]`, i.e.
/// the 3×3 matrix whose last column is `(0, 0, 1)`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a11, a12, a21, a22, a_dx, a_dy] = *a;
    let [b11, b12, b21, b22, b_dx, b_dy] = *b;

    [
        a11 * b11 + a12 * b21,
        a11 * b12 + a12 * b22,
        a21 * b11 + a22 * b21,
        a21 * b12 + a22 * b22,
        a_dx * b11 + a_dy * b21 + b_dx,
        a_dx * b12 + a_dy * b22 + b_dy,
    ]
}
622
/// Applies the affine transform `matrix` (`[a, b, c, d, e, f]`) to the point
/// `(x, y)` and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
629
630/// Calculate text width using actual font metrics (including kerning)
631fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
632    // If we have font metrics, use them for accurate width calculation
633    if let Some(font) = font_info {
634        if let Some(ref widths) = font.metrics.widths {
635            let first_char = font.metrics.first_char.unwrap_or(0);
636            let last_char = font.metrics.last_char.unwrap_or(255);
637            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
638
639            let mut total_width = 0.0;
640            let chars: Vec<char> = text.chars().collect();
641
642            for (i, &ch) in chars.iter().enumerate() {
643                let char_code = ch as u32;
644
645                // Get width from Widths array or use missing_width
646                let width = if char_code >= first_char && char_code <= last_char {
647                    let index = (char_code - first_char) as usize;
648                    widths.get(index).copied().unwrap_or(missing_width)
649                } else {
650                    missing_width
651                };
652
653                // Convert from glyph space (1/1000 units) to user space
654                total_width += width / 1000.0 * font_size;
655
656                // Apply kerning if available (for character pairs)
657                if let Some(ref kerning) = font.metrics.kerning {
658                    if i + 1 < chars.len() {
659                        let next_char = chars[i + 1] as u32;
660                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
661                            // Kerning is in FUnits (1/1000), convert to user space
662                            total_width += kern_value / 1000.0 * font_size;
663                        }
664                    }
665                }
666            }
667
668            return total_width;
669        }
670    }
671
672    // Fallback to simplified calculation if no metrics available
673    text.len() as f64 * font_size * 0.5
674}
675
676#[cfg(test)]
677mod tests {
678    use super::*;
679
680    #[test]
681    fn test_matrix_multiplication() {
682        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
683        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
684
685        let result = multiply_matrix(&identity, &translation);
686        assert_eq!(result, translation);
687
688        let result2 = multiply_matrix(&translation, &identity);
689        assert_eq!(result2, translation);
690    }
691
692    #[test]
693    fn test_transform_point() {
694        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
695        let (x, y) = transform_point(5.0, 5.0, &translation);
696        assert_eq!(x, 15.0);
697        assert_eq!(y, 25.0);
698    }
699
700    #[test]
701    fn test_extraction_options_default() {
702        let options = ExtractionOptions::default();
703        assert!(!options.preserve_layout);
704        assert_eq!(options.space_threshold, 0.2);
705        assert_eq!(options.newline_threshold, 10.0);
706        assert!(options.sort_by_position);
707        assert!(!options.detect_columns);
708        assert_eq!(options.column_threshold, 50.0);
709        assert!(options.merge_hyphenated);
710    }
711
712    #[test]
713    fn test_extraction_options_custom() {
714        let options = ExtractionOptions {
715            preserve_layout: true,
716            space_threshold: 0.5,
717            newline_threshold: 15.0,
718            sort_by_position: false,
719            detect_columns: true,
720            column_threshold: 75.0,
721            merge_hyphenated: false,
722        };
723        assert!(options.preserve_layout);
724        assert_eq!(options.space_threshold, 0.5);
725        assert_eq!(options.newline_threshold, 15.0);
726        assert!(!options.sort_by_position);
727        assert!(options.detect_columns);
728        assert_eq!(options.column_threshold, 75.0);
729        assert!(!options.merge_hyphenated);
730    }
731
732    #[test]
733    fn test_text_fragment() {
734        let fragment = TextFragment {
735            text: "Hello".to_string(),
736            x: 100.0,
737            y: 200.0,
738            width: 50.0,
739            height: 12.0,
740            font_size: 10.0,
741        };
742        assert_eq!(fragment.text, "Hello");
743        assert_eq!(fragment.x, 100.0);
744        assert_eq!(fragment.y, 200.0);
745        assert_eq!(fragment.width, 50.0);
746        assert_eq!(fragment.height, 12.0);
747        assert_eq!(fragment.font_size, 10.0);
748    }
749
750    #[test]
751    fn test_extracted_text() {
752        let fragments = vec![
753            TextFragment {
754                text: "Hello".to_string(),
755                x: 100.0,
756                y: 200.0,
757                width: 50.0,
758                height: 12.0,
759                font_size: 10.0,
760            },
761            TextFragment {
762                text: "World".to_string(),
763                x: 160.0,
764                y: 200.0,
765                width: 50.0,
766                height: 12.0,
767                font_size: 10.0,
768            },
769        ];
770
771        let extracted = ExtractedText {
772            text: "Hello World".to_string(),
773            fragments: fragments.clone(),
774        };
775
776        assert_eq!(extracted.text, "Hello World");
777        assert_eq!(extracted.fragments.len(), 2);
778        assert_eq!(extracted.fragments[0].text, "Hello");
779        assert_eq!(extracted.fragments[1].text, "World");
780    }
781
782    #[test]
783    fn test_text_state_default() {
784        let state = TextState::default();
785        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
786        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
787        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
788        assert_eq!(state.leading, 0.0);
789        assert_eq!(state.char_space, 0.0);
790        assert_eq!(state.word_space, 0.0);
791        assert_eq!(state.horizontal_scale, 100.0);
792        assert_eq!(state.text_rise, 0.0);
793        assert_eq!(state.font_size, 0.0);
794        assert!(state.font_name.is_none());
795        assert_eq!(state.render_mode, 0);
796    }
797
798    #[test]
799    fn test_matrix_operations() {
800        // Test rotation matrix
801        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
802        let (x, y) = transform_point(1.0, 0.0, &rotation);
803        assert_eq!(x, 0.0);
804        assert_eq!(y, 1.0);
805
806        // Test scaling matrix
807        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
808        let (x, y) = transform_point(5.0, 5.0, &scale);
809        assert_eq!(x, 10.0);
810        assert_eq!(y, 15.0);
811
812        // Test complex transformation
813        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
814        let (x, y) = transform_point(1.0, 1.0, &complex);
815        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
816        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
817    }
818
819    #[test]
820    fn test_text_extractor_new() {
821        let extractor = TextExtractor::new();
822        let options = extractor.options;
823        assert!(!options.preserve_layout);
824        assert_eq!(options.space_threshold, 0.2);
825        assert_eq!(options.newline_threshold, 10.0);
826        assert!(options.sort_by_position);
827        assert!(!options.detect_columns);
828        assert_eq!(options.column_threshold, 50.0);
829        assert!(options.merge_hyphenated);
830    }
831
832    #[test]
833    fn test_text_extractor_with_options() {
834        let options = ExtractionOptions {
835            preserve_layout: true,
836            space_threshold: 0.3,
837            newline_threshold: 12.0,
838            sort_by_position: false,
839            detect_columns: true,
840            column_threshold: 60.0,
841            merge_hyphenated: false,
842        };
843        let extractor = TextExtractor::with_options(options.clone());
844        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
845        assert_eq!(extractor.options.space_threshold, options.space_threshold);
846        assert_eq!(
847            extractor.options.newline_threshold,
848            options.newline_threshold
849        );
850        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
851        assert_eq!(extractor.options.detect_columns, options.detect_columns);
852        assert_eq!(extractor.options.column_threshold, options.column_threshold);
853        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
854    }
855
856    // =========================================================================
857    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
858    // =========================================================================
859
860    #[test]
861    fn test_calculate_text_width_with_no_font_info() {
862        // Test fallback: should use simplified calculation
863        let width = calculate_text_width("Hello", 12.0, None);
864
865        // Expected: 5 chars * 12.0 * 0.5 = 30.0
866        assert_eq!(
867            width, 30.0,
868            "Without font info, should use simplified calculation: len * font_size * 0.5"
869        );
870    }
871
872    #[test]
873    fn test_calculate_text_width_with_empty_metrics() {
874        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
875
876        // Font with no widths array
877        let font_info = FontInfo {
878            name: "TestFont".to_string(),
879            font_type: "Type1".to_string(),
880            encoding: None,
881            to_unicode: None,
882            differences: None,
883            descendant_font: None,
884            cid_to_gid_map: None,
885            metrics: FontMetrics {
886                first_char: None,
887                last_char: None,
888                widths: None,
889                missing_width: Some(500.0),
890                kerning: None,
891            },
892        };
893
894        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
895
896        // Should fall back to simplified calculation
897        assert_eq!(
898            width, 30.0,
899            "Without widths array, should fall back to simplified calculation"
900        );
901    }
902
903    #[test]
904    fn test_calculate_text_width_with_complete_metrics() {
905        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
906
907        // Font with complete metrics for ASCII range 32-126
908        // Simulate typical Helvetica widths (in 1/1000 units)
909        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
910
911        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
912        widths[72 - 32] = 722.0; // 'H' is ASCII 72
913        widths[101 - 32] = 556.0; // 'e' is ASCII 101
914        widths[108 - 32] = 278.0; // 'l' is ASCII 108
915        widths[111 - 32] = 611.0; // 'o' is ASCII 111
916
917        let font_info = FontInfo {
918            name: "Helvetica".to_string(),
919            font_type: "Type1".to_string(),
920            encoding: None,
921            to_unicode: None,
922            differences: None,
923            descendant_font: None,
924            cid_to_gid_map: None,
925            metrics: FontMetrics {
926                first_char: Some(32),
927                last_char: Some(126),
928                widths: Some(widths),
929                missing_width: Some(500.0),
930                kerning: None,
931            },
932        };
933
934        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
935
936        // Expected calculation (widths in glyph space / 1000 * font_size):
937        // H: 722/1000 * 12 = 8.664
938        // e: 556/1000 * 12 = 6.672
939        // l: 278/1000 * 12 = 3.336
940        // l: 278/1000 * 12 = 3.336
941        // o: 611/1000 * 12 = 7.332
942        // Total: 29.34
943        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
944        let tolerance = 0.0001; // Floating point tolerance
945        assert!(
946            (width - expected).abs() < tolerance,
947            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
948            expected,
949            width,
950            (width - expected).abs()
951        );
952
953        // Verify it's different from simplified calculation
954        let simplified = 5.0 * 12.0 * 0.5; // 30.0
955        assert_ne!(
956            width, simplified,
957            "Metrics-based calculation should differ from simplified (30.0)"
958        );
959    }
960
961    #[test]
962    fn test_calculate_text_width_character_outside_range() {
963        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
964
965        // Font with narrow range (only covers 'A'-'Z')
966        let widths = vec![722.0; 26]; // All uppercase letters same width
967
968        let font_info = FontInfo {
969            name: "TestFont".to_string(),
970            font_type: "Type1".to_string(),
971            encoding: None,
972            to_unicode: None,
973            differences: None,
974            descendant_font: None,
975            cid_to_gid_map: None,
976            metrics: FontMetrics {
977                first_char: Some(65), // 'A'
978                last_char: Some(90),  // 'Z'
979                widths: Some(widths),
980                missing_width: Some(500.0),
981                kerning: None,
982            },
983        };
984
985        // Test with character outside range
986        let width = calculate_text_width("A1", 10.0, Some(&font_info));
987
988        // Expected:
989        // 'A' (65) is in range: 722/1000 * 10 = 7.22
990        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
991        // Total: 12.22
992        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
993        assert_eq!(
994            width, expected,
995            "Should use missing_width for characters outside range"
996        );
997    }
998
999    #[test]
1000    fn test_calculate_text_width_missing_width_in_array() {
1001        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1002
1003        // Font with incomplete widths array (some characters have 0.0)
1004        let mut widths = vec![500.0; 95]; // Default width
1005        widths[10] = 0.0; // Character at index 10 has no width defined
1006
1007        let font_info = FontInfo {
1008            name: "TestFont".to_string(),
1009            font_type: "Type1".to_string(),
1010            encoding: None,
1011            to_unicode: None,
1012            differences: None,
1013            descendant_font: None,
1014            cid_to_gid_map: None,
1015            metrics: FontMetrics {
1016                first_char: Some(32),
1017                last_char: Some(126),
1018                widths: Some(widths),
1019                missing_width: Some(600.0),
1020                kerning: None,
1021            },
1022        };
1023
1024        // Character 42 (index 10 from first_char 32)
1025        let char_code = 42u8 as char; // '*'
1026        let text = char_code.to_string();
1027        let width = calculate_text_width(&text, 10.0, Some(&font_info));
1028
1029        // Character is in range but width is 0.0, should NOT fall back to missing_width
1030        // (0.0 is a valid width for zero-width characters)
1031        assert_eq!(
1032            width, 0.0,
1033            "Should use 0.0 width from array, not missing_width"
1034        );
1035    }
1036
1037    #[test]
1038    fn test_calculate_text_width_empty_string() {
1039        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1040
1041        let font_info = FontInfo {
1042            name: "TestFont".to_string(),
1043            font_type: "Type1".to_string(),
1044            encoding: None,
1045            to_unicode: None,
1046            differences: None,
1047            descendant_font: None,
1048            cid_to_gid_map: None,
1049            metrics: FontMetrics {
1050                first_char: Some(32),
1051                last_char: Some(126),
1052                widths: Some(vec![500.0; 95]),
1053                missing_width: Some(500.0),
1054                kerning: None,
1055            },
1056        };
1057
1058        let width = calculate_text_width("", 12.0, Some(&font_info));
1059        assert_eq!(width, 0.0, "Empty string should have zero width");
1060
1061        // Also test without font info
1062        let width_no_font = calculate_text_width("", 12.0, None);
1063        assert_eq!(
1064            width_no_font, 0.0,
1065            "Empty string should have zero width (no font)"
1066        );
1067    }
1068
1069    #[test]
1070    fn test_calculate_text_width_unicode_characters() {
1071        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1072
1073        // Font with limited ASCII range
1074        let font_info = FontInfo {
1075            name: "TestFont".to_string(),
1076            font_type: "Type1".to_string(),
1077            encoding: None,
1078            to_unicode: None,
1079            differences: None,
1080            descendant_font: None,
1081            cid_to_gid_map: None,
1082            metrics: FontMetrics {
1083                first_char: Some(32),
1084                last_char: Some(126),
1085                widths: Some(vec![500.0; 95]),
1086                missing_width: Some(600.0),
1087                kerning: None,
1088            },
1089        };
1090
1091        // Test with Unicode characters outside ASCII range
1092        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1093
1094        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
1095        // Expected: 600/1000 * 10 = 6.0
1096        assert_eq!(
1097            width, 6.0,
1098            "Unicode character outside range should use missing_width"
1099        );
1100    }
1101
1102    #[test]
1103    fn test_calculate_text_width_different_font_sizes() {
1104        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1105
1106        let font_info = FontInfo {
1107            name: "TestFont".to_string(),
1108            font_type: "Type1".to_string(),
1109            encoding: None,
1110            to_unicode: None,
1111            differences: None,
1112            descendant_font: None,
1113            cid_to_gid_map: None,
1114            metrics: FontMetrics {
1115                first_char: Some(65), // 'A'
1116                last_char: Some(65),  // 'A'
1117                widths: Some(vec![722.0]),
1118                missing_width: Some(500.0),
1119                kerning: None,
1120            },
1121        };
1122
1123        // Test same character with different font sizes
1124        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1125        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1126
1127        // Widths should scale linearly with font size
1128        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1129        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1130        assert_eq!(
1131            width_20,
1132            width_10 * 2.0,
1133            "Width should scale linearly with font size"
1134        );
1135    }
1136
1137    #[test]
1138    fn test_calculate_text_width_proportional_vs_monospace() {
1139        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1140
1141        // Simulate proportional font (different widths)
1142        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1143        let proportional_font = FontInfo {
1144            name: "Helvetica".to_string(),
1145            font_type: "Type1".to_string(),
1146            encoding: None,
1147            to_unicode: None,
1148            differences: None,
1149            descendant_font: None,
1150            cid_to_gid_map: None,
1151            metrics: FontMetrics {
1152                first_char: Some(105), // 'i'
1153                last_char: Some(107),  // covers i, j, k
1154                widths: Some(proportional_widths),
1155                missing_width: Some(500.0),
1156                kerning: None,
1157            },
1158        };
1159
1160        // Simulate monospace font (same width)
1161        let monospace_widths = vec![600.0, 600.0, 600.0];
1162        let monospace_font = FontInfo {
1163            name: "Courier".to_string(),
1164            font_type: "Type1".to_string(),
1165            encoding: None,
1166            to_unicode: None,
1167            differences: None,
1168            descendant_font: None,
1169            cid_to_gid_map: None,
1170            metrics: FontMetrics {
1171                first_char: Some(105),
1172                last_char: Some(107),
1173                widths: Some(monospace_widths),
1174                missing_width: Some(600.0),
1175                kerning: None,
1176            },
1177        };
1178
1179        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1180        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1181
1182        // Proportional 'i' should be narrower than monospace 'i'
1183        assert!(
1184            prop_width < mono_width,
1185            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1186            prop_width,
1187            mono_width
1188        );
1189    }
1190
1191    // =========================================================================
1192    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1193    // =========================================================================
1194
1195    #[test]
1196    fn test_calculate_text_width_with_kerning() {
1197        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1198        use std::collections::HashMap;
1199
1200        // Create a font with kerning pairs
1201        let mut widths = vec![500.0; 95]; // ASCII 32-126
1202        widths[65 - 32] = 722.0; // 'A'
1203        widths[86 - 32] = 722.0; // 'V'
1204        widths[87 - 32] = 944.0; // 'W'
1205
1206        let mut kerning = HashMap::new();
1207        // Typical kerning pairs (in FUnits, 1/1000)
1208        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1209        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1210
1211        let font_info = FontInfo {
1212            name: "Helvetica".to_string(),
1213            font_type: "Type1".to_string(),
1214            encoding: None,
1215            to_unicode: None,
1216            differences: None,
1217            descendant_font: None,
1218            cid_to_gid_map: None,
1219            metrics: FontMetrics {
1220                first_char: Some(32),
1221                last_char: Some(126),
1222                widths: Some(widths),
1223                missing_width: Some(500.0),
1224                kerning: Some(kerning),
1225            },
1226        };
1227
1228        // Test "AV" with kerning
1229        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1230        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1231        //         = 17.328 - 0.6 = 16.728
1232        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1233        let tolerance = 0.0001;
1234        assert!(
1235            (width_av - expected_av).abs() < tolerance,
1236            "AV with kerning: expected {}, got {}, diff {}",
1237            expected_av,
1238            width_av,
1239            (width_av - expected_av).abs()
1240        );
1241
1242        // Test "AW" with different kerning value
1243        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1244        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1245        //         = 19.992 - 0.48 = 19.512
1246        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1247        assert!(
1248            (width_aw - expected_aw).abs() < tolerance,
1249            "AW with kerning: expected {}, got {}, diff {}",
1250            expected_aw,
1251            width_aw,
1252            (width_aw - expected_aw).abs()
1253        );
1254
1255        // Test "VA" with NO kerning (pair not in HashMap)
1256        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1257        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1258        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1259        assert!(
1260            (width_va - expected_va).abs() < tolerance,
1261            "VA without kerning: expected {}, got {}, diff {}",
1262            expected_va,
1263            width_va,
1264            (width_va - expected_va).abs()
1265        );
1266
1267        // Verify kerning makes a measurable difference
1268        assert!(
1269            width_av < width_va,
1270            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1271            width_av,
1272            width_va
1273        );
1274    }
1275
1276    #[test]
1277    fn test_parse_truetype_kern_table_minimal() {
1278        use crate::text::extraction_cmap::parse_truetype_kern_table;
1279
1280        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1281        // Structure:
1282        // 1. Offset table (12 bytes)
1283        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1284        // 3. 'head' table data (54 bytes)
1285        // 4. 'kern' table data (30 bytes)
1286        // Total: 128 bytes
1287        let mut ttf_data = vec![
1288            // Offset table
1289            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1290            0x00, 0x02, // numTables: 2
1291            0x00, 0x20, // searchRange: 32
1292            0x00, 0x01, // entrySelector: 1
1293            0x00, 0x00, // rangeShift: 0
1294        ];
1295
1296        // Table directory entry 1: 'head' table
1297        ttf_data.extend_from_slice(b"head"); // tag
1298        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1299        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1300        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1301
1302        // Table directory entry 2: 'kern' table
1303        ttf_data.extend_from_slice(b"kern"); // tag
1304        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1305        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1306        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1307
1308        // 'head' table data (54 bytes of zeros - minimal valid head table)
1309        ttf_data.extend_from_slice(&[0u8; 54]);
1310
1311        // 'kern' table data (34 bytes)
1312        ttf_data.extend_from_slice(&[
1313            // Kern table header
1314            0x00, 0x00, // version: 0
1315            0x00, 0x01, // nTables: 1
1316            // Subtable header
1317            0x00, 0x00, // version: 0
1318            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1319            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1320            0x00, 0x02, // nPairs: 2
1321            0x00, 0x08, // searchRange: 8
1322            0x00, 0x00, // entrySelector: 0
1323            0x00, 0x04, // rangeShift: 4
1324            // Kerning pair 1: A + V → -50
1325            0x00, 0x41, // left glyph: 65 ('A')
1326            0x00, 0x56, // right glyph: 86 ('V')
1327            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1328            // Kerning pair 2: A + W → -40
1329            0x00, 0x41, // left glyph: 65 ('A')
1330            0x00, 0x57, // right glyph: 87 ('W')
1331            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1332        ]);
1333
1334        let result = parse_truetype_kern_table(&ttf_data);
1335        assert!(
1336            result.is_ok(),
1337            "Should parse minimal kern table successfully: {:?}",
1338            result.err()
1339        );
1340
1341        let kerning_map = result.unwrap();
1342        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1343
1344        // Verify pair 1: A + V → -50
1345        assert_eq!(
1346            kerning_map.get(&(65, 86)),
1347            Some(&-50.0),
1348            "Should have A+V kerning pair with value -50"
1349        );
1350
1351        // Verify pair 2: A + W → -40
1352        assert_eq!(
1353            kerning_map.get(&(65, 87)),
1354            Some(&-40.0),
1355            "Should have A+W kerning pair with value -40"
1356        );
1357    }
1358
1359    #[test]
1360    fn test_parse_kern_table_no_kern_table() {
1361        use crate::text::extraction_cmap::extract_truetype_kerning;
1362
1363        // TrueType font data WITHOUT a 'kern' table
1364        // Structure:
1365        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1366        // - Table directory: 1 entry for 'head' table (not 'kern')
1367        let ttf_data = vec![
1368            // Offset table
1369            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1370            0x00, 0x01, // numTables: 1
1371            0x00, 0x10, // searchRange: 16
1372            0x00, 0x00, // entrySelector: 0
1373            0x00, 0x00, // rangeShift: 0
1374            // Table directory entry: 'head' table (not 'kern')
1375            b'h', b'e', b'a', b'd', // tag: 'head'
1376            0x00, 0x00, 0x00, 0x00, // checksum
1377            0x00, 0x00, 0x00, 0x1C, // offset: 28
1378            0x00, 0x00, 0x00, 0x36, // length: 54
1379            // Mock 'head' table data (54 bytes of zeros)
1380            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1381            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1382            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1383            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1384        ];
1385
1386        let result = extract_truetype_kerning(&ttf_data);
1387        assert!(
1388            result.is_ok(),
1389            "Should gracefully handle missing kern table"
1390        );
1391
1392        let kerning_map = result.unwrap();
1393        assert!(
1394            kerning_map.is_empty(),
1395            "Should return empty HashMap when no kern table exists"
1396        );
1397    }
1398}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs