oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable knobs for page text extraction.
///
/// The values produced by [`Default`] favour plain reading-order text:
/// fragments are sorted by position, hyphenated line ends are joined, and
/// layout preservation and column detection are switched off.
#[derive(Clone, Debug)]
pub struct ExtractionOptions {
    /// Collect positioned fragments and rebuild the returned text from them.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text are considered to be
    /// on different lines.
    pub newline_threshold: f64,
    /// Order collected fragments top-to-bottom, then left-to-right.
    pub sort_by_position: bool,
    /// After sorting, look for column gaps and reorder fragments by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments of a line for
    /// the gap to count as a column separation.
    pub column_threshold: f64,
    /// Join a line ending in `-` with the following line instead of
    /// emitting the hyphen and a newline.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Returns the stock configuration: position sorting and hyphen merging
    /// enabled, layout preservation and column detection disabled.
    fn default() -> Self {
        ExtractionOptions {
            // Feature switches
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            // Distance thresholds
            space_threshold: 0.3,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
48
49/// Extracted text with position information
50#[derive(Debug, Clone)]
51pub struct ExtractedText {
52    /// The extracted text content
53    pub text: String,
54    /// Text fragments with position information (if preserve_layout is true)
55    pub fragments: Vec<TextFragment>,
56}
57
58/// A fragment of text with position information
59#[derive(Debug, Clone)]
60pub struct TextFragment {
61    /// Text content
62    pub text: String,
63    /// X position in page coordinates
64    pub x: f64,
65    /// Y position in page coordinates
66    pub y: f64,
67    /// Width of the text
68    pub width: f64,
69    /// Height of the text
70    pub height: f64,
71    /// Font size
72    pub font_size: f64,
73    /// Font name (if known) - used for kerning-aware text spacing
74    pub font_name: Option<String>,
75    /// Whether the font is bold (detected from font name)
76    pub is_bold: bool,
77    /// Whether the font is italic (detected from font name)
78    pub is_italic: bool,
79    /// Fill color of the text (from graphics state)
80    pub color: Option<Color>,
81}
82
83/// Text extraction state
84struct TextState {
85    /// Current text matrix
86    text_matrix: [f64; 6],
87    /// Current text line matrix
88    text_line_matrix: [f64; 6],
89    /// Current transformation matrix (CTM)
90    ctm: [f64; 6],
91    /// Text leading (line spacing)
92    leading: f64,
93    /// Character spacing
94    char_space: f64,
95    /// Word spacing
96    word_space: f64,
97    /// Horizontal scaling
98    horizontal_scale: f64,
99    /// Text rise
100    text_rise: f64,
101    /// Current font size
102    font_size: f64,
103    /// Current font name
104    font_name: Option<String>,
105    /// Render mode (0 = fill, 1 = stroke, etc.)
106    render_mode: u8,
107    /// Fill color (for text rendering)
108    fill_color: Option<Color>,
109}
110
111impl Default for TextState {
112    fn default() -> Self {
113        Self {
114            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117            leading: 0.0,
118            char_space: 0.0,
119            word_space: 0.0,
120            horizontal_scale: 100.0,
121            text_rise: 0.0,
122            font_size: 0.0,
123            font_name: None,
124            render_mode: 0,
125            fill_color: None,
126        }
127    }
128}
129
/// Guess bold/italic styling from a font name.
///
/// The check is a case-insensitive substring heuristic. A name counts as
/// bold when it contains `bold`, `-b` or ` b `, or ends with ` b`; it counts
/// as italic when it contains `italic`, `oblique`, `-i` or ` i `, or ends
/// with ` i`. This covers PostScript-style names ("Helvetica-Bold",
/// "Times-BoldItalic") as well as spaced names ("Arial Bold",
/// "Courier Oblique").
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // True when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    let lowered = font_name.to_lowercase();

    let is_bold = contains_any(&lowered, &["bold", "-b", " b "]) || lowered.ends_with(" b");
    let is_italic = contains_any(&lowered, &["italic", "oblique", "-i", " i "])
        || lowered.ends_with(" i");

    (is_bold, is_italic)
}
168
169/// Text extractor for PDF pages with CMap support
170pub struct TextExtractor {
171    options: ExtractionOptions,
172    /// Font cache for the current extraction
173    font_cache: HashMap<String, FontInfo>,
174}
175
176impl TextExtractor {
177    /// Create a new text extractor with default options
178    pub fn new() -> Self {
179        Self {
180            options: ExtractionOptions::default(),
181            font_cache: HashMap::new(),
182        }
183    }
184
185    /// Create a text extractor with custom options
186    pub fn with_options(options: ExtractionOptions) -> Self {
187        Self {
188            options,
189            font_cache: HashMap::new(),
190        }
191    }
192
193    /// Extract text from a PDF document
194    pub fn extract_from_document<R: Read + Seek>(
195        &mut self,
196        document: &PdfDocument<R>,
197    ) -> ParseResult<Vec<ExtractedText>> {
198        let page_count = document.page_count()?;
199        let mut results = Vec::new();
200
201        for i in 0..page_count {
202            let text = self.extract_from_page(document, i)?;
203            results.push(text);
204        }
205
206        Ok(results)
207    }
208
209    /// Extract text from a specific page
210    pub fn extract_from_page<R: Read + Seek>(
211        &mut self,
212        document: &PdfDocument<R>,
213        page_index: u32,
214    ) -> ParseResult<ExtractedText> {
215        // Get the page
216        let page = document.get_page(page_index)?;
217
218        // Extract font resources first
219        self.extract_font_resources(&page, document)?;
220
221        // Get content streams
222        let streams = page.content_streams_with_document(document)?;
223
224        let mut extracted_text = String::new();
225        let mut fragments = Vec::new();
226        let mut state = TextState::default();
227        let mut in_text_object = false;
228        let mut last_x = 0.0;
229        let mut last_y = 0.0;
230
231        // Process each content stream
232        for (stream_idx, stream_data) in streams.iter().enumerate() {
233            let operations = match ContentParser::parse_content(stream_data) {
234                Ok(ops) => ops,
235                Err(e) => {
236                    // Enhanced diagnostic logging for content stream parsing failures
237                    tracing::debug!(
238                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
239                        page_index + 1,
240                        stream_idx + 1,
241                        streams.len()
242                    );
243                    tracing::debug!("         Error: {}", e);
244                    tracing::debug!("         Stream size: {} bytes", stream_data.len());
245
246                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
247                    let preview_len = stream_data.len().min(100);
248                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249                    tracing::debug!(
250                        "         Stream preview (first {} bytes): {:?}",
251                        preview_len,
252                        preview.chars().take(80).collect::<String>()
253                    );
254
255                    // Continue processing other streams
256                    continue;
257                }
258            };
259
260            for op in operations {
261                match op {
262                    ContentOperation::BeginText => {
263                        in_text_object = true;
264                        // Reset text matrix to identity
265                        state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266                        state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267                    }
268
269                    ContentOperation::EndText => {
270                        in_text_object = false;
271                    }
272
273                    ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274                        state.text_matrix =
275                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276                        state.text_line_matrix =
277                            [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278                    }
279
280                    ContentOperation::MoveText(tx, ty) => {
281                        // Update text matrix by translation
282                        let new_matrix = multiply_matrix(
283                            &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284                            &state.text_line_matrix,
285                        );
286                        state.text_matrix = new_matrix;
287                        state.text_line_matrix = new_matrix;
288                    }
289
290                    ContentOperation::NextLine => {
291                        // Move to next line using current leading
292                        let new_matrix = multiply_matrix(
293                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294                            &state.text_line_matrix,
295                        );
296                        state.text_matrix = new_matrix;
297                        state.text_line_matrix = new_matrix;
298                    }
299
300                    ContentOperation::ShowText(text) => {
301                        if in_text_object {
302                            let text_bytes = &text;
303                            let decoded = self.decode_text(text_bytes, &state)?;
304
305                            // Calculate position: Apply text_matrix, then CTM
306                            // Concatenate: final = CTM × text_matrix
307                            let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308                            let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310                            // Add spacing based on position change
311                            if !extracted_text.is_empty() {
312                                let dx = x - last_x;
313                                let dy = (y - last_y).abs();
314
315                                if dy > self.options.newline_threshold {
316                                    extracted_text.push('\n');
317                                } else if dx > self.options.space_threshold * state.font_size {
318                                    extracted_text.push(' ');
319                                }
320                            }
321
322                            extracted_text.push_str(&decoded);
323
324                            // Get font info for accurate width calculation
325                            let font_info = state
326                                .font_name
327                                .as_ref()
328                                .and_then(|name| self.font_cache.get(name));
329
330                            if self.options.preserve_layout {
331                                // Detect bold/italic from font name
332                                let (is_bold, is_italic) = state
333                                    .font_name
334                                    .as_ref()
335                                    .map(|name| parse_font_style(name))
336                                    .unwrap_or((false, false));
337
338                                fragments.push(TextFragment {
339                                    text: decoded.clone(),
340                                    x,
341                                    y,
342                                    width: calculate_text_width(
343                                        &decoded,
344                                        state.font_size,
345                                        font_info,
346                                    ),
347                                    height: state.font_size,
348                                    font_size: state.font_size,
349                                    font_name: state.font_name.clone(),
350                                    is_bold,
351                                    is_italic,
352                                    color: state.fill_color,
353                                });
354                            }
355
356                            // Update position for next text
357                            last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358                            last_y = y;
359
360                            // Update text matrix for next show operation
361                            let text_width =
362                                calculate_text_width(&decoded, state.font_size, font_info);
363                            let tx = text_width * state.horizontal_scale / 100.0;
364                            state.text_matrix =
365                                multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366                        }
367                    }
368
369                    ContentOperation::ShowTextArray(array) => {
370                        if in_text_object {
371                            // Get font info for accurate width calculation
372                            let font_info = state
373                                .font_name
374                                .as_ref()
375                                .and_then(|name| self.font_cache.get(name));
376
377                            for item in array {
378                                match item {
379                                    TextElement::Text(text_bytes) => {
380                                        let decoded = self.decode_text(&text_bytes, &state)?;
381                                        extracted_text.push_str(&decoded);
382
383                                        // Update text matrix
384                                        let text_width = calculate_text_width(
385                                            &decoded,
386                                            state.font_size,
387                                            font_info,
388                                        );
389                                        let tx = text_width * state.horizontal_scale / 100.0;
390                                        state.text_matrix = multiply_matrix(
391                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392                                            &state.text_matrix,
393                                        );
394                                    }
395                                    TextElement::Spacing(adjustment) => {
396                                        // Text position adjustment (negative = move left)
397                                        let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398                                        state.text_matrix = multiply_matrix(
399                                            &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400                                            &state.text_matrix,
401                                        );
402                                    }
403                                }
404                            }
405                        }
406                    }
407
408                    ContentOperation::SetFont(name, size) => {
409                        state.font_name = Some(name);
410                        state.font_size = size as f64;
411                    }
412
413                    ContentOperation::SetLeading(leading) => {
414                        state.leading = leading as f64;
415                    }
416
417                    ContentOperation::SetCharSpacing(spacing) => {
418                        state.char_space = spacing as f64;
419                    }
420
421                    ContentOperation::SetWordSpacing(spacing) => {
422                        state.word_space = spacing as f64;
423                    }
424
425                    ContentOperation::SetHorizontalScaling(scale) => {
426                        state.horizontal_scale = scale as f64;
427                    }
428
429                    ContentOperation::SetTextRise(rise) => {
430                        state.text_rise = rise as f64;
431                    }
432
433                    ContentOperation::SetTextRenderMode(mode) => {
434                        state.render_mode = mode as u8;
435                    }
436
437                    ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438                        // Update CTM: new_ctm = concat_matrix * current_ctm
439                        let [a0, b0, c0, d0, e0, f0] = state.ctm;
440                        let a = a as f64;
441                        let b = b as f64;
442                        let c = c as f64;
443                        let d = d as f64;
444                        let e = e as f64;
445                        let f = f as f64;
446                        state.ctm = [
447                            a * a0 + b * c0,
448                            a * b0 + b * d0,
449                            c * a0 + d * c0,
450                            c * b0 + d * d0,
451                            e * a0 + f * c0 + e0,
452                            e * b0 + f * d0 + f0,
453                        ];
454                    }
455
456                    // Color operations (Phase 4: Color extraction)
457                    ContentOperation::SetNonStrokingGray(gray) => {
458                        state.fill_color = Some(Color::gray(gray as f64));
459                    }
460
461                    ContentOperation::SetNonStrokingRGB(r, g, b) => {
462                        state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463                    }
464
465                    ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466                        state.fill_color =
467                            Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468                    }
469
470                    _ => {
471                        // Other operations don't affect text extraction
472                    }
473                }
474            }
475        }
476
477        // Sort and process fragments if requested
478        if self.options.sort_by_position && !fragments.is_empty() {
479            self.sort_and_merge_fragments(&mut fragments);
480        }
481
482        // Merge close fragments to eliminate spacing artifacts
483        // This is crucial for table detection and structured data extraction
484        if self.options.preserve_layout && !fragments.is_empty() {
485            fragments = self.merge_close_fragments(&fragments);
486        }
487
488        // Reconstruct text from sorted fragments if layout is preserved
489        if self.options.preserve_layout && !fragments.is_empty() {
490            extracted_text = self.reconstruct_text_from_fragments(&fragments);
491        }
492
493        Ok(ExtractedText {
494            text: extracted_text,
495            fragments,
496        })
497    }
498
499    /// Sort text fragments by position and merge them appropriately
500    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501        // Sort fragments by Y position (top to bottom) then X position (left to right)
502        fragments.sort_by(|a, b| {
503            // First compare Y position (with threshold for same line)
504            let y_diff = (b.y - a.y).abs();
505            if y_diff < self.options.newline_threshold {
506                // Same line, sort by X position
507                a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
508            } else {
509                // Different lines, sort by Y (inverted because PDF Y increases upward)
510                b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
511            }
512        });
513
514        // Detect columns if requested
515        if self.options.detect_columns {
516            self.detect_and_sort_columns(fragments);
517        }
518    }
519
520    /// Detect columns and re-sort fragments accordingly
521    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
522        // Group fragments by approximate Y position
523        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
524        let mut current_line: Vec<&mut TextFragment> = Vec::new();
525        let mut last_y = f64::INFINITY;
526
527        for fragment in fragments.iter_mut() {
528            let fragment_y = fragment.y;
529            if (last_y - fragment_y).abs() > self.options.newline_threshold
530                && !current_line.is_empty()
531            {
532                lines.push(current_line);
533                current_line = Vec::new();
534            }
535            current_line.push(fragment);
536            last_y = fragment_y;
537        }
538        if !current_line.is_empty() {
539            lines.push(current_line);
540        }
541
542        // Detect column boundaries
543        let mut column_boundaries = vec![0.0];
544        for line in &lines {
545            if line.len() > 1 {
546                for i in 0..line.len() - 1 {
547                    let gap = line[i + 1].x - (line[i].x + line[i].width);
548                    if gap > self.options.column_threshold {
549                        let boundary = line[i].x + line[i].width + gap / 2.0;
550                        if !column_boundaries
551                            .iter()
552                            .any(|&b| (b - boundary).abs() < 10.0)
553                        {
554                            column_boundaries.push(boundary);
555                        }
556                    }
557                }
558            }
559        }
560        column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
561
562        // Re-sort fragments by column then Y position
563        if column_boundaries.len() > 1 {
564            fragments.sort_by(|a, b| {
565                // Determine column for each fragment
566                let col_a = column_boundaries
567                    .iter()
568                    .position(|&boundary| a.x < boundary)
569                    .unwrap_or(column_boundaries.len())
570                    - 1;
571                let col_b = column_boundaries
572                    .iter()
573                    .position(|&boundary| b.x < boundary)
574                    .unwrap_or(column_boundaries.len())
575                    - 1;
576
577                if col_a != col_b {
578                    col_a.cmp(&col_b)
579                } else {
580                    // Same column, sort by Y position
581                    b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
582                }
583            });
584        }
585    }
586
587    /// Reconstruct text from sorted fragments
588    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
589        // First, merge consecutive fragments that are very close together
590        let merged_fragments = self.merge_close_fragments(fragments);
591
592        let mut result = String::new();
593        let mut last_y = f64::INFINITY;
594        let mut last_x = 0.0;
595        let mut last_line_ended_with_hyphen = false;
596
597        for fragment in &merged_fragments {
598            // Check if we need a newline
599            let y_diff = (last_y - fragment.y).abs();
600            if !result.is_empty() && y_diff > self.options.newline_threshold {
601                // Handle hyphenation
602                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
603                    // Remove the hyphen and don't add newline
604                    if result.ends_with('-') {
605                        result.pop();
606                    }
607                } else {
608                    result.push('\n');
609                }
610            } else if !result.is_empty() {
611                // Check if we need a space
612                let x_gap = fragment.x - last_x;
613                if x_gap > self.options.space_threshold * fragment.font_size {
614                    result.push(' ');
615                }
616            }
617
618            result.push_str(&fragment.text);
619            last_line_ended_with_hyphen = fragment.text.ends_with('-');
620            last_y = fragment.y;
621            last_x = fragment.x + fragment.width;
622        }
623
624        result
625    }
626
627    /// Merge fragments that are very close together on the same line
628    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
629    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
630        if fragments.is_empty() {
631            return Vec::new();
632        }
633
634        let mut merged = Vec::new();
635        let mut current = fragments[0].clone();
636
637        for fragment in &fragments[1..] {
638            // Check if this fragment is on the same line and very close
639            let y_diff = (current.y - fragment.y).abs();
640            let x_gap = fragment.x - (current.x + current.width);
641
642            // Merge if on same line and gap is less than a character width
643            // Use 0.5 * font_size as threshold - this catches most artificial spacing
644            let should_merge = y_diff < 1.0  // Same line (very tight tolerance)
645                && x_gap >= 0.0  // Fragment is to the right
646                && x_gap < fragment.font_size * 0.5; // Gap less than 50% of font size
647
648            if should_merge {
649                // Merge this fragment into current
650                current.text.push_str(&fragment.text);
651                current.width = (fragment.x + fragment.width) - current.x;
652            } else {
653                // Start a new fragment
654                merged.push(current);
655                current = fragment.clone();
656            }
657        }
658
659        merged.push(current);
660        merged
661    }
662
663    /// Extract font resources from page
664    fn extract_font_resources<R: Read + Seek>(
665        &mut self,
666        page: &ParsedPage,
667        document: &PdfDocument<R>,
668    ) -> ParseResult<()> {
669        // Clear previous font cache
670        self.font_cache.clear();
671
672        // Try to get resources manually from page dictionary first
673        // This is necessary because ParsedPage.get_resources() may not always work
674        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
675            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
676            {
677                if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
678                    // Extract each font
679                    for (font_name, font_obj) in font_dict.0.iter() {
680                        if let Some(font_ref) = font_obj.as_reference() {
681                            if let Ok(PdfObject::Dictionary(font_dict)) =
682                                document.get_object(font_ref.0, font_ref.1)
683                            {
684                                // Create a CMap extractor to use its font extraction logic
685                                let mut cmap_extractor: CMapTextExtractor<R> =
686                                    CMapTextExtractor::new();
687
688                                if let Ok(font_info) =
689                                    cmap_extractor.extract_font_info(&font_dict, document)
690                                {
691                                    let has_to_unicode = font_info.to_unicode.is_some();
692                                    self.font_cache.insert(font_name.0.clone(), font_info);
693                                    tracing::debug!(
694                                        "Cached font: {} (ToUnicode: {})",
695                                        font_name.0,
696                                        has_to_unicode
697                                    );
698                                }
699                            }
700                        }
701                    }
702                }
703            }
704        } else if let Some(resources) = page.get_resources() {
705            // Fallback to get_resources() if Resources is not a reference
706            if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
707                for (font_name, font_obj) in font_dict.0.iter() {
708                    if let Some(font_ref) = font_obj.as_reference() {
709                        if let Ok(PdfObject::Dictionary(font_dict)) =
710                            document.get_object(font_ref.0, font_ref.1)
711                        {
712                            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
713
714                            if let Ok(font_info) =
715                                cmap_extractor.extract_font_info(&font_dict, document)
716                            {
717                                let has_to_unicode = font_info.to_unicode.is_some();
718                                self.font_cache.insert(font_name.0.clone(), font_info);
719                                tracing::debug!(
720                                    "Cached font: {} (ToUnicode: {})",
721                                    font_name.0,
722                                    has_to_unicode
723                                );
724                            }
725                        }
726                    }
727                }
728            }
729        }
730
731        Ok(())
732    }
733
734    /// Decode text using the current font encoding and ToUnicode mapping
735    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
736        use crate::text::encoding::TextEncoding;
737
738        // First, try to use cached font information with ToUnicode CMap
739        if let Some(ref font_name) = state.font_name {
740            if let Some(font_info) = self.font_cache.get(font_name) {
741                // Create a temporary CMapTextExtractor to use its decoding logic
742                let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
743
744                // Try CMap-based decoding first
745                if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
746                    // Only accept if we got meaningful text (not all null bytes or garbage)
747                    if !decoded.trim().is_empty()
748                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
749                    {
750                        // Apply sanitization to remove control characters (Issue #116)
751                        let sanitized = sanitize_extracted_text(&decoded);
752                        tracing::debug!(
753                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
754                            font_name,
755                            text,
756                            sanitized
757                        );
758                        return Ok(sanitized);
759                    }
760                }
761
762                tracing::debug!(
763                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
764                    font_name
765                );
766            }
767        }
768
769        // Fall back to encoding-based decoding
770        let encoding = if let Some(ref font_name) = state.font_name {
771            match font_name.to_lowercase().as_str() {
772                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
773                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
774                name if name.contains("standard") => TextEncoding::StandardEncoding,
775                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
776                _ => {
777                    // Default based on common patterns
778                    if font_name.starts_with("Times")
779                        || font_name.starts_with("Helvetica")
780                        || font_name.starts_with("Courier")
781                    {
782                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
783                    } else {
784                        TextEncoding::PdfDocEncoding // Safe default
785                    }
786                }
787            }
788        } else {
789            TextEncoding::WinAnsiEncoding // Default for most PDFs
790        };
791
792        let fallback_result = encoding.decode(text);
793        // Apply sanitization to remove control characters (Issue #116)
794        let sanitized = sanitize_extracted_text(&fallback_result);
795        tracing::debug!(
796            "Fallback encoding decoding: {:?} -> \"{}\"",
797            text,
798            sanitized
799        );
800        Ok(sanitized)
801    }
802}
803
804impl Default for TextExtractor {
805    fn default() -> Self {
806        Self::new()
807    }
808}
809
/// Concatenate two affine transforms given in six-number form.
///
/// Each array `[a, b, c, d, e, f]` is treated as the 3x3 matrix
/// `[[a, b, 0], [c, d, 0], [e, f, 1]]`; the returned array is the product
/// `a * b` in that same six-number form.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear part: 2x2 product.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation part: a's offset mapped through b, plus b's own offset.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
821
/// Map the point `(x, y)` through an affine transform in six-number form.
///
/// For `matrix = [a, b, c, d, e, f]` the result is
/// `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
828
829/// Calculate text width using actual font metrics (including kerning)
830fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
831    // If we have font metrics, use them for accurate width calculation
832    if let Some(font) = font_info {
833        if let Some(ref widths) = font.metrics.widths {
834            let first_char = font.metrics.first_char.unwrap_or(0);
835            let last_char = font.metrics.last_char.unwrap_or(255);
836            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
837
838            let mut total_width = 0.0;
839            let chars: Vec<char> = text.chars().collect();
840
841            for (i, &ch) in chars.iter().enumerate() {
842                let char_code = ch as u32;
843
844                // Get width from Widths array or use missing_width
845                let width = if char_code >= first_char && char_code <= last_char {
846                    let index = (char_code - first_char) as usize;
847                    widths.get(index).copied().unwrap_or(missing_width)
848                } else {
849                    missing_width
850                };
851
852                // Convert from glyph space (1/1000 units) to user space
853                total_width += width / 1000.0 * font_size;
854
855                // Apply kerning if available (for character pairs)
856                if let Some(ref kerning) = font.metrics.kerning {
857                    if i + 1 < chars.len() {
858                        let next_char = chars[i + 1] as u32;
859                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
860                            // Kerning is in FUnits (1/1000), convert to user space
861                            total_width += kern_value / 1000.0 * font_size;
862                        }
863                    }
864                }
865            }
866
867            return total_width;
868        }
869    }
870
871    // Fallback to simplified calculation if no metrics available
872    text.len() as f64 * font_size * 0.5
873}
874
/// Clean up decoded PDF text by neutralising stray control characters.
///
/// Some PDFs (Issue #116) yield NUL (`\0`) and ETX (`\u{3}`) characters in
/// places where a word separator was intended. This function normalises such
/// output.
///
/// # Behavior
///
/// - `\0` becomes a space; a `\0\u{3}` pair therefore becomes a single space
///   because the ETX is dropped
/// - `\t`, `\n` and `\r` are kept as they are
/// - Every other ASCII control character (0x01-0x1F and 0x7F) is removed
/// - Runs of spaces (including spaces produced from `\0`) collapse to one
///   space; spaces directly after a tab are dropped as well
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX acting as word separator
/// assert_eq!(
///     sanitize_extracted_text("a\0\u{3}sergeant\0\u{3}and"),
///     "a sergeant and"
/// );
///
/// // A lone NUL turns into a space
/// assert_eq!(sanitize_extracted_text("word\0another"), "word another");
///
/// // Already clean text is returned unchanged
/// assert_eq!(sanitize_extracted_text("Normal text"), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // The output never grows: every input char maps to at most one char of
    // the same or smaller encoded size.
    let mut cleaned = String::with_capacity(text.len());
    // True while the most recently emitted character suppresses a following
    // space (a space itself, or a tab).
    let mut suppress_space = false;

    for ch in text.chars() {
        match ch {
            // NUL and plain spaces both act as a (collapsible) space.
            '\0' | ' ' => {
                if !suppress_space {
                    cleaned.push(' ');
                    suppress_space = true;
                }
            }

            // A tab is kept and swallows any spaces that follow it.
            '\t' => {
                cleaned.push('\t');
                suppress_space = true;
            }

            // Line breaks are kept and restart space tracking.
            '\n' | '\r' => {
                cleaned.push(ch);
                suppress_space = false;
            }

            // ETX and all remaining control characters vanish without
            // touching the space-tracking state.
            other if other.is_ascii_control() => {}

            // Everything else is ordinary text.
            other => {
                cleaned.push(other);
                suppress_space = false;
            }
        }
    }

    cleaned
}
967
968#[cfg(test)]
969mod tests {
970    use super::*;
971
972    #[test]
973    fn test_matrix_multiplication() {
974        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
975        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
976
977        let result = multiply_matrix(&identity, &translation);
978        assert_eq!(result, translation);
979
980        let result2 = multiply_matrix(&translation, &identity);
981        assert_eq!(result2, translation);
982    }
983
984    #[test]
985    fn test_transform_point() {
986        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
987        let (x, y) = transform_point(5.0, 5.0, &translation);
988        assert_eq!(x, 15.0);
989        assert_eq!(y, 25.0);
990    }
991
992    #[test]
993    fn test_extraction_options_default() {
994        let options = ExtractionOptions::default();
995        assert!(!options.preserve_layout);
996        assert_eq!(options.space_threshold, 0.3);
997        assert_eq!(options.newline_threshold, 10.0);
998        assert!(options.sort_by_position);
999        assert!(!options.detect_columns);
1000        assert_eq!(options.column_threshold, 50.0);
1001        assert!(options.merge_hyphenated);
1002    }
1003
1004    #[test]
1005    fn test_extraction_options_custom() {
1006        let options = ExtractionOptions {
1007            preserve_layout: true,
1008            space_threshold: 0.5,
1009            newline_threshold: 15.0,
1010            sort_by_position: false,
1011            detect_columns: true,
1012            column_threshold: 75.0,
1013            merge_hyphenated: false,
1014        };
1015        assert!(options.preserve_layout);
1016        assert_eq!(options.space_threshold, 0.5);
1017        assert_eq!(options.newline_threshold, 15.0);
1018        assert!(!options.sort_by_position);
1019        assert!(options.detect_columns);
1020        assert_eq!(options.column_threshold, 75.0);
1021        assert!(!options.merge_hyphenated);
1022    }
1023
1024    #[test]
1025    fn test_parse_font_style_bold() {
1026        // PostScript style
1027        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1028        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1029
1030        // TrueType style
1031        assert_eq!(parse_font_style("Arial Bold"), (true, false));
1032        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1033
1034        // Short form
1035        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1036    }
1037
1038    #[test]
1039    fn test_parse_font_style_italic() {
1040        // PostScript style
1041        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1042        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1043
1044        // TrueType style
1045        assert_eq!(parse_font_style("Arial Italic"), (false, true));
1046        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1047
1048        // Short form
1049        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1050    }
1051
1052    #[test]
1053    fn test_parse_font_style_bold_italic() {
1054        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1055        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1056        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1057    }
1058
1059    #[test]
1060    fn test_parse_font_style_regular() {
1061        assert_eq!(parse_font_style("Helvetica"), (false, false));
1062        assert_eq!(parse_font_style("Times-Roman"), (false, false));
1063        assert_eq!(parse_font_style("Courier"), (false, false));
1064        assert_eq!(parse_font_style("Arial"), (false, false));
1065    }
1066
1067    #[test]
1068    fn test_parse_font_style_edge_cases() {
1069        // Empty and unusual cases
1070        assert_eq!(parse_font_style(""), (false, false));
1071        assert_eq!(parse_font_style("UnknownFont"), (false, false));
1072
1073        // Case insensitive
1074        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1075        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1076    }
1077
1078    #[test]
1079    fn test_text_fragment() {
1080        let fragment = TextFragment {
1081            text: "Hello".to_string(),
1082            x: 100.0,
1083            y: 200.0,
1084            width: 50.0,
1085            height: 12.0,
1086            font_size: 10.0,
1087            font_name: None,
1088            is_bold: false,
1089            is_italic: false,
1090            color: None,
1091        };
1092        assert_eq!(fragment.text, "Hello");
1093        assert_eq!(fragment.x, 100.0);
1094        assert_eq!(fragment.y, 200.0);
1095        assert_eq!(fragment.width, 50.0);
1096        assert_eq!(fragment.height, 12.0);
1097        assert_eq!(fragment.font_size, 10.0);
1098    }
1099
1100    #[test]
1101    fn test_extracted_text() {
1102        let fragments = vec![
1103            TextFragment {
1104                text: "Hello".to_string(),
1105                x: 100.0,
1106                y: 200.0,
1107                width: 50.0,
1108                height: 12.0,
1109                font_size: 10.0,
1110                font_name: None,
1111                is_bold: false,
1112                is_italic: false,
1113                color: None,
1114            },
1115            TextFragment {
1116                text: "World".to_string(),
1117                x: 160.0,
1118                y: 200.0,
1119                width: 50.0,
1120                height: 12.0,
1121                font_size: 10.0,
1122                font_name: None,
1123                is_bold: false,
1124                is_italic: false,
1125                color: None,
1126            },
1127        ];
1128
1129        let extracted = ExtractedText {
1130            text: "Hello World".to_string(),
1131            fragments: fragments,
1132        };
1133
1134        assert_eq!(extracted.text, "Hello World");
1135        assert_eq!(extracted.fragments.len(), 2);
1136        assert_eq!(extracted.fragments[0].text, "Hello");
1137        assert_eq!(extracted.fragments[1].text, "World");
1138    }
1139
1140    #[test]
1141    fn test_text_state_default() {
1142        let state = TextState::default();
1143        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1144        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1145        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1146        assert_eq!(state.leading, 0.0);
1147        assert_eq!(state.char_space, 0.0);
1148        assert_eq!(state.word_space, 0.0);
1149        assert_eq!(state.horizontal_scale, 100.0);
1150        assert_eq!(state.text_rise, 0.0);
1151        assert_eq!(state.font_size, 0.0);
1152        assert!(state.font_name.is_none());
1153        assert_eq!(state.render_mode, 0);
1154    }
1155
1156    #[test]
1157    fn test_matrix_operations() {
1158        // Test rotation matrix
1159        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
1160        let (x, y) = transform_point(1.0, 0.0, &rotation);
1161        assert_eq!(x, 0.0);
1162        assert_eq!(y, 1.0);
1163
1164        // Test scaling matrix
1165        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1166        let (x, y) = transform_point(5.0, 5.0, &scale);
1167        assert_eq!(x, 10.0);
1168        assert_eq!(y, 15.0);
1169
1170        // Test complex transformation
1171        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1172        let (x, y) = transform_point(1.0, 1.0, &complex);
1173        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
1174        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
1175    }
1176
1177    #[test]
1178    fn test_text_extractor_new() {
1179        let extractor = TextExtractor::new();
1180        let options = extractor.options;
1181        assert!(!options.preserve_layout);
1182        assert_eq!(options.space_threshold, 0.3);
1183        assert_eq!(options.newline_threshold, 10.0);
1184        assert!(options.sort_by_position);
1185        assert!(!options.detect_columns);
1186        assert_eq!(options.column_threshold, 50.0);
1187        assert!(options.merge_hyphenated);
1188    }
1189
1190    #[test]
1191    fn test_text_extractor_with_options() {
1192        let options = ExtractionOptions {
1193            preserve_layout: true,
1194            space_threshold: 0.3,
1195            newline_threshold: 12.0,
1196            sort_by_position: false,
1197            detect_columns: true,
1198            column_threshold: 60.0,
1199            merge_hyphenated: false,
1200        };
1201        let extractor = TextExtractor::with_options(options.clone());
1202        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1203        assert_eq!(extractor.options.space_threshold, options.space_threshold);
1204        assert_eq!(
1205            extractor.options.newline_threshold,
1206            options.newline_threshold
1207        );
1208        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1209        assert_eq!(extractor.options.detect_columns, options.detect_columns);
1210        assert_eq!(extractor.options.column_threshold, options.column_threshold);
1211        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1212    }
1213
1214    // =========================================================================
1215    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
1216    // =========================================================================
1217
1218    #[test]
1219    fn test_calculate_text_width_with_no_font_info() {
1220        // Test fallback: should use simplified calculation
1221        let width = calculate_text_width("Hello", 12.0, None);
1222
1223        // Expected: 5 chars * 12.0 * 0.5 = 30.0
1224        assert_eq!(
1225            width, 30.0,
1226            "Without font info, should use simplified calculation: len * font_size * 0.5"
1227        );
1228    }
1229
1230    #[test]
1231    fn test_calculate_text_width_with_empty_metrics() {
1232        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1233
1234        // Font with no widths array
1235        let font_info = FontInfo {
1236            name: "TestFont".to_string(),
1237            font_type: "Type1".to_string(),
1238            encoding: None,
1239            to_unicode: None,
1240            differences: None,
1241            descendant_font: None,
1242            cid_to_gid_map: None,
1243            metrics: FontMetrics {
1244                first_char: None,
1245                last_char: None,
1246                widths: None,
1247                missing_width: Some(500.0),
1248                kerning: None,
1249            },
1250        };
1251
1252        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1253
1254        // Should fall back to simplified calculation
1255        assert_eq!(
1256            width, 30.0,
1257            "Without widths array, should fall back to simplified calculation"
1258        );
1259    }
1260
1261    #[test]
1262    fn test_calculate_text_width_with_complete_metrics() {
1263        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1264
1265        // Font with complete metrics for ASCII range 32-126
1266        // Simulate typical Helvetica widths (in 1/1000 units)
1267        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
1268
1269        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
1270        widths[72 - 32] = 722.0; // 'H' is ASCII 72
1271        widths[101 - 32] = 556.0; // 'e' is ASCII 101
1272        widths[108 - 32] = 278.0; // 'l' is ASCII 108
1273        widths[111 - 32] = 611.0; // 'o' is ASCII 111
1274
1275        let font_info = FontInfo {
1276            name: "Helvetica".to_string(),
1277            font_type: "Type1".to_string(),
1278            encoding: None,
1279            to_unicode: None,
1280            differences: None,
1281            descendant_font: None,
1282            cid_to_gid_map: None,
1283            metrics: FontMetrics {
1284                first_char: Some(32),
1285                last_char: Some(126),
1286                widths: Some(widths),
1287                missing_width: Some(500.0),
1288                kerning: None,
1289            },
1290        };
1291
1292        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1293
1294        // Expected calculation (widths in glyph space / 1000 * font_size):
1295        // H: 722/1000 * 12 = 8.664
1296        // e: 556/1000 * 12 = 6.672
1297        // l: 278/1000 * 12 = 3.336
1298        // l: 278/1000 * 12 = 3.336
1299        // o: 611/1000 * 12 = 7.332
1300        // Total: 29.34
1301        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1302        let tolerance = 0.0001; // Floating point tolerance
1303        assert!(
1304            (width - expected).abs() < tolerance,
1305            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1306            expected,
1307            width,
1308            (width - expected).abs()
1309        );
1310
1311        // Verify it's different from simplified calculation
1312        let simplified = 5.0 * 12.0 * 0.5; // 30.0
1313        assert_ne!(
1314            width, simplified,
1315            "Metrics-based calculation should differ from simplified (30.0)"
1316        );
1317    }
1318
1319    #[test]
1320    fn test_calculate_text_width_character_outside_range() {
1321        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1322
1323        // Font with narrow range (only covers 'A'-'Z')
1324        let widths = vec![722.0; 26]; // All uppercase letters same width
1325
1326        let font_info = FontInfo {
1327            name: "TestFont".to_string(),
1328            font_type: "Type1".to_string(),
1329            encoding: None,
1330            to_unicode: None,
1331            differences: None,
1332            descendant_font: None,
1333            cid_to_gid_map: None,
1334            metrics: FontMetrics {
1335                first_char: Some(65), // 'A'
1336                last_char: Some(90),  // 'Z'
1337                widths: Some(widths),
1338                missing_width: Some(500.0),
1339                kerning: None,
1340            },
1341        };
1342
1343        // Test with character outside range
1344        let width = calculate_text_width("A1", 10.0, Some(&font_info));
1345
1346        // Expected:
1347        // 'A' (65) is in range: 722/1000 * 10 = 7.22
1348        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
1349        // Total: 12.22
1350        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1351        assert_eq!(
1352            width, expected,
1353            "Should use missing_width for characters outside range"
1354        );
1355    }
1356
1357    #[test]
1358    fn test_calculate_text_width_missing_width_in_array() {
1359        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1360
1361        // Font with incomplete widths array (some characters have 0.0)
1362        let mut widths = vec![500.0; 95]; // Default width
1363        widths[10] = 0.0; // Character at index 10 has no width defined
1364
1365        let font_info = FontInfo {
1366            name: "TestFont".to_string(),
1367            font_type: "Type1".to_string(),
1368            encoding: None,
1369            to_unicode: None,
1370            differences: None,
1371            descendant_font: None,
1372            cid_to_gid_map: None,
1373            metrics: FontMetrics {
1374                first_char: Some(32),
1375                last_char: Some(126),
1376                widths: Some(widths),
1377                missing_width: Some(600.0),
1378                kerning: None,
1379            },
1380        };
1381
1382        // Character 42 (index 10 from first_char 32)
1383        let char_code = 42u8 as char; // '*'
1384        let text = char_code.to_string();
1385        let width = calculate_text_width(&text, 10.0, Some(&font_info));
1386
1387        // Character is in range but width is 0.0, should NOT fall back to missing_width
1388        // (0.0 is a valid width for zero-width characters)
1389        assert_eq!(
1390            width, 0.0,
1391            "Should use 0.0 width from array, not missing_width"
1392        );
1393    }
1394
1395    #[test]
1396    fn test_calculate_text_width_empty_string() {
1397        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1398
1399        let font_info = FontInfo {
1400            name: "TestFont".to_string(),
1401            font_type: "Type1".to_string(),
1402            encoding: None,
1403            to_unicode: None,
1404            differences: None,
1405            descendant_font: None,
1406            cid_to_gid_map: None,
1407            metrics: FontMetrics {
1408                first_char: Some(32),
1409                last_char: Some(126),
1410                widths: Some(vec![500.0; 95]),
1411                missing_width: Some(500.0),
1412                kerning: None,
1413            },
1414        };
1415
1416        let width = calculate_text_width("", 12.0, Some(&font_info));
1417        assert_eq!(width, 0.0, "Empty string should have zero width");
1418
1419        // Also test without font info
1420        let width_no_font = calculate_text_width("", 12.0, None);
1421        assert_eq!(
1422            width_no_font, 0.0,
1423            "Empty string should have zero width (no font)"
1424        );
1425    }
1426
1427    #[test]
1428    fn test_calculate_text_width_unicode_characters() {
1429        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1430
1431        // Font with limited ASCII range
1432        let font_info = FontInfo {
1433            name: "TestFont".to_string(),
1434            font_type: "Type1".to_string(),
1435            encoding: None,
1436            to_unicode: None,
1437            differences: None,
1438            descendant_font: None,
1439            cid_to_gid_map: None,
1440            metrics: FontMetrics {
1441                first_char: Some(32),
1442                last_char: Some(126),
1443                widths: Some(vec![500.0; 95]),
1444                missing_width: Some(600.0),
1445                kerning: None,
1446            },
1447        };
1448
1449        // Test with Unicode characters outside ASCII range
1450        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1451
1452        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
1453        // Expected: 600/1000 * 10 = 6.0
1454        assert_eq!(
1455            width, 6.0,
1456            "Unicode character outside range should use missing_width"
1457        );
1458    }
1459
1460    #[test]
1461    fn test_calculate_text_width_different_font_sizes() {
1462        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1463
1464        let font_info = FontInfo {
1465            name: "TestFont".to_string(),
1466            font_type: "Type1".to_string(),
1467            encoding: None,
1468            to_unicode: None,
1469            differences: None,
1470            descendant_font: None,
1471            cid_to_gid_map: None,
1472            metrics: FontMetrics {
1473                first_char: Some(65), // 'A'
1474                last_char: Some(65),  // 'A'
1475                widths: Some(vec![722.0]),
1476                missing_width: Some(500.0),
1477                kerning: None,
1478            },
1479        };
1480
1481        // Test same character with different font sizes
1482        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1483        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1484
1485        // Widths should scale linearly with font size
1486        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1487        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1488        assert_eq!(
1489            width_20,
1490            width_10 * 2.0,
1491            "Width should scale linearly with font size"
1492        );
1493    }
1494
1495    #[test]
1496    fn test_calculate_text_width_proportional_vs_monospace() {
1497        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1498
1499        // Simulate proportional font (different widths)
1500        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
1501        let proportional_font = FontInfo {
1502            name: "Helvetica".to_string(),
1503            font_type: "Type1".to_string(),
1504            encoding: None,
1505            to_unicode: None,
1506            differences: None,
1507            descendant_font: None,
1508            cid_to_gid_map: None,
1509            metrics: FontMetrics {
1510                first_char: Some(105), // 'i'
1511                last_char: Some(107),  // covers i, j, k
1512                widths: Some(proportional_widths),
1513                missing_width: Some(500.0),
1514                kerning: None,
1515            },
1516        };
1517
1518        // Simulate monospace font (same width)
1519        let monospace_widths = vec![600.0, 600.0, 600.0];
1520        let monospace_font = FontInfo {
1521            name: "Courier".to_string(),
1522            font_type: "Type1".to_string(),
1523            encoding: None,
1524            to_unicode: None,
1525            differences: None,
1526            descendant_font: None,
1527            cid_to_gid_map: None,
1528            metrics: FontMetrics {
1529                first_char: Some(105),
1530                last_char: Some(107),
1531                widths: Some(monospace_widths),
1532                missing_width: Some(600.0),
1533                kerning: None,
1534            },
1535        };
1536
1537        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1538        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1539
1540        // Proportional 'i' should be narrower than monospace 'i'
1541        assert!(
1542            prop_width < mono_width,
1543            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1544            prop_width,
1545            mono_width
1546        );
1547    }
1548
1549    // =========================================================================
1550    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
1551    // =========================================================================
1552
1553    #[test]
1554    fn test_calculate_text_width_with_kerning() {
1555        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1556        use std::collections::HashMap;
1557
1558        // Create a font with kerning pairs
1559        let mut widths = vec![500.0; 95]; // ASCII 32-126
1560        widths[65 - 32] = 722.0; // 'A'
1561        widths[86 - 32] = 722.0; // 'V'
1562        widths[87 - 32] = 944.0; // 'W'
1563
1564        let mut kerning = HashMap::new();
1565        // Typical kerning pairs (in FUnits, 1/1000)
1566        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
1567        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
1568
1569        let font_info = FontInfo {
1570            name: "Helvetica".to_string(),
1571            font_type: "Type1".to_string(),
1572            encoding: None,
1573            to_unicode: None,
1574            differences: None,
1575            descendant_font: None,
1576            cid_to_gid_map: None,
1577            metrics: FontMetrics {
1578                first_char: Some(32),
1579                last_char: Some(126),
1580                widths: Some(widths),
1581                missing_width: Some(500.0),
1582                kerning: Some(kerning),
1583            },
1584        };
1585
1586        // Test "AV" with kerning
1587        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
1588        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
1589        //         = 17.328 - 0.6 = 16.728
1590        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
1591        let tolerance = 0.0001;
1592        assert!(
1593            (width_av - expected_av).abs() < tolerance,
1594            "AV with kerning: expected {}, got {}, diff {}",
1595            expected_av,
1596            width_av,
1597            (width_av - expected_av).abs()
1598        );
1599
1600        // Test "AW" with different kerning value
1601        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
1602        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
1603        //         = 19.992 - 0.48 = 19.512
1604        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
1605        assert!(
1606            (width_aw - expected_aw).abs() < tolerance,
1607            "AW with kerning: expected {}, got {}, diff {}",
1608            expected_aw,
1609            width_aw,
1610            (width_aw - expected_aw).abs()
1611        );
1612
1613        // Test "VA" with NO kerning (pair not in HashMap)
1614        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
1615        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
1616        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
1617        assert!(
1618            (width_va - expected_va).abs() < tolerance,
1619            "VA without kerning: expected {}, got {}, diff {}",
1620            expected_va,
1621            width_va,
1622            (width_va - expected_va).abs()
1623        );
1624
1625        // Verify kerning makes a measurable difference
1626        assert!(
1627            width_av < width_va,
1628            "AV with kerning ({}) should be narrower than VA without kerning ({})",
1629            width_av,
1630            width_va
1631        );
1632    }
1633
1634    #[test]
1635    fn test_parse_truetype_kern_table_minimal() {
1636        use crate::text::extraction_cmap::parse_truetype_kern_table;
1637
1638        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
1639        // Structure:
1640        // 1. Offset table (12 bytes)
1641        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
1642        // 3. 'head' table data (54 bytes)
1643        // 4. 'kern' table data (30 bytes)
1644        // Total: 128 bytes
1645        let mut ttf_data = vec![
1646            // Offset table
1647            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1648            0x00, 0x02, // numTables: 2
1649            0x00, 0x20, // searchRange: 32
1650            0x00, 0x01, // entrySelector: 1
1651            0x00, 0x00, // rangeShift: 0
1652        ];
1653
1654        // Table directory entry 1: 'head' table
1655        ttf_data.extend_from_slice(b"head"); // tag
1656        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1657        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
1658        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
1659
1660        // Table directory entry 2: 'kern' table
1661        ttf_data.extend_from_slice(b"kern"); // tag
1662        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
1663        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
1664        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
1665
1666        // 'head' table data (54 bytes of zeros - minimal valid head table)
1667        ttf_data.extend_from_slice(&[0u8; 54]);
1668
1669        // 'kern' table data (34 bytes)
1670        ttf_data.extend_from_slice(&[
1671            // Kern table header
1672            0x00, 0x00, // version: 0
1673            0x00, 0x01, // nTables: 1
1674            // Subtable header
1675            0x00, 0x00, // version: 0
1676            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
1677            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
1678            0x00, 0x02, // nPairs: 2
1679            0x00, 0x08, // searchRange: 8
1680            0x00, 0x00, // entrySelector: 0
1681            0x00, 0x04, // rangeShift: 4
1682            // Kerning pair 1: A + V → -50
1683            0x00, 0x41, // left glyph: 65 ('A')
1684            0x00, 0x56, // right glyph: 86 ('V')
1685            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
1686            // Kerning pair 2: A + W → -40
1687            0x00, 0x41, // left glyph: 65 ('A')
1688            0x00, 0x57, // right glyph: 87 ('W')
1689            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
1690        ]);
1691
1692        let result = parse_truetype_kern_table(&ttf_data);
1693        assert!(
1694            result.is_ok(),
1695            "Should parse minimal kern table successfully: {:?}",
1696            result.err()
1697        );
1698
1699        let kerning_map = result.unwrap();
1700        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
1701
1702        // Verify pair 1: A + V → -50
1703        assert_eq!(
1704            kerning_map.get(&(65, 86)),
1705            Some(&-50.0),
1706            "Should have A+V kerning pair with value -50"
1707        );
1708
1709        // Verify pair 2: A + W → -40
1710        assert_eq!(
1711            kerning_map.get(&(65, 87)),
1712            Some(&-40.0),
1713            "Should have A+W kerning pair with value -40"
1714        );
1715    }
1716
1717    #[test]
1718    fn test_parse_kern_table_no_kern_table() {
1719        use crate::text::extraction_cmap::extract_truetype_kerning;
1720
1721        // TrueType font data WITHOUT a 'kern' table
1722        // Structure:
1723        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
1724        // - Table directory: 1 entry for 'head' table (not 'kern')
1725        let ttf_data = vec![
1726            // Offset table
1727            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
1728            0x00, 0x01, // numTables: 1
1729            0x00, 0x10, // searchRange: 16
1730            0x00, 0x00, // entrySelector: 0
1731            0x00, 0x00, // rangeShift: 0
1732            // Table directory entry: 'head' table (not 'kern')
1733            b'h', b'e', b'a', b'd', // tag: 'head'
1734            0x00, 0x00, 0x00, 0x00, // checksum
1735            0x00, 0x00, 0x00, 0x1C, // offset: 28
1736            0x00, 0x00, 0x00, 0x36, // length: 54
1737            // Mock 'head' table data (54 bytes of zeros)
1738            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1739            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1740            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1741            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1742        ];
1743
1744        let result = extract_truetype_kerning(&ttf_data);
1745        assert!(
1746            result.is_ok(),
1747            "Should gracefully handle missing kern table"
1748        );
1749
1750        let kerning_map = result.unwrap();
1751        assert!(
1752            kerning_map.is_empty(),
1753            "Should return empty HashMap when no kern table exists"
1754        );
1755    }
1756}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs