oxidize_pdf/parser/
encoding.rs

//! Character Encoding Detection and Conversion Module
//!
//! This module provides robust character encoding detection and conversion capabilities
//! to handle the diverse encoding scenarios found in real-world PDF files.
//!
//! # Overview
//!
//! Many PDFs contain text encoded in various character sets beyond UTF-8, including:
//! - Latin-1 (ISO 8859-1) - Common in European documents
//! - Windows-1252 - Microsoft's extension of Latin-1
//! - MacRoman - Apple's legacy encoding
//! - Various PDF-specific encodings
//!
//! This module provides automatic detection and graceful conversion with fallback
//! handling for unrecognized characters.
16
17use crate::error::PdfError;
18use std::collections::HashMap;
19
20/// Character encoding detection and conversion result
21#[derive(Debug, Clone)]
22pub struct EncodingResult {
23    /// The decoded text
24    pub text: String,
25    /// Detected encoding (if any)
26    pub detected_encoding: Option<EncodingType>,
27    /// Number of replacement characters used
28    pub replacement_count: usize,
29    /// Confidence level (0.0 to 1.0)
30    pub confidence: f64,
31}
32
/// Character encodings this module knows how to name and decode.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EncodingType {
    /// UTF-8, the modern default.
    Utf8,
    /// ISO 8859-1 (Latin-1), common in European documents.
    Latin1,
    /// Windows-1252, Microsoft's superset of Latin-1.
    Windows1252,
    /// MacRoman, Apple's legacy single-byte encoding.
    MacRoman,
    /// The encoding built into PDF for text strings.
    PdfDocEncoding,
    /// No single encoding could be identified (unknown or mixed content).
    Mixed,
}
49
50impl EncodingType {
51    /// Get human-readable name
52    pub fn name(&self) -> &'static str {
53        match self {
54            EncodingType::Utf8 => "UTF-8",
55            EncodingType::Latin1 => "ISO 8859-1 (Latin-1)",
56            EncodingType::Windows1252 => "Windows-1252",
57            EncodingType::MacRoman => "MacRoman",
58            EncodingType::PdfDocEncoding => "PDFDocEncoding",
59            EncodingType::Mixed => "Mixed/Unknown",
60        }
61    }
62}
63
64/// Configuration for character encoding processing
65#[derive(Debug, Clone)]
66pub struct EncodingOptions {
67    /// Enable lenient mode (use replacement characters instead of failing)
68    pub lenient_mode: bool,
69    /// Prefer specific encoding for detection
70    pub preferred_encoding: Option<EncodingType>,
71    /// Maximum replacement characters before giving up
72    pub max_replacements: usize,
73    /// Log problematic characters for analysis
74    pub log_issues: bool,
75}
76
77impl Default for EncodingOptions {
78    fn default() -> Self {
79        Self {
80            lenient_mode: true,
81            preferred_encoding: None,
82            max_replacements: 100,
83            log_issues: false,
84        }
85    }
86}
87
88/// Main character decoder trait
89pub trait CharacterDecoder {
90    /// Decode bytes to string with encoding detection
91    fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError>;
92
93    /// Detect the most likely encoding for the given bytes
94    fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType>;
95
96    /// Convert bytes using a specific encoding
97    fn decode_with_encoding(
98        &self,
99        bytes: &[u8],
100        encoding: EncodingType,
101        lenient: bool,
102    ) -> Result<String, PdfError>;
103}
104
105/// Enhanced character decoder implementation
106pub struct EnhancedDecoder {
107    /// Latin-1 to Unicode mapping
108    latin1_map: HashMap<u8, char>,
109    /// Windows-1252 to Unicode mapping
110    windows1252_map: HashMap<u8, char>,
111    /// MacRoman to Unicode mapping
112    macroman_map: HashMap<u8, char>,
113    /// Issue logger for analysis
114    issue_log: Vec<EncodingIssue>,
115}
116
117/// Information about encoding issues encountered
118#[derive(Debug, Clone)]
119pub struct EncodingIssue {
120    pub byte_value: u8,
121    pub context: String,
122    pub attempted_encodings: Vec<EncodingType>,
123    pub resolution: IssueResolution,
124}
125
/// The way an [`EncodingIssue`] was resolved.
#[derive(Clone, Debug)]
pub enum IssueResolution {
    /// The byte was substituted with a replacement character.
    ReplacementCharacter,
    /// The byte was converted to the contained character.
    SuccessfulConversion(char),
    /// The byte was dropped from the output.
    Skipped,
}
132
133impl EnhancedDecoder {
134    /// Create a new enhanced decoder with all encoding tables loaded
135    pub fn new() -> Self {
136        let mut decoder = Self {
137            latin1_map: HashMap::new(),
138            windows1252_map: HashMap::new(),
139            macroman_map: HashMap::new(),
140            issue_log: Vec::new(),
141        };
142
143        decoder.initialize_encoding_tables();
144        decoder
145    }
146
147    /// Initialize all encoding conversion tables
148    fn initialize_encoding_tables(&mut self) {
149        // Latin-1 (ISO 8859-1) mapping - direct 1:1 mapping for 0x80-0xFF
150        for i in 0x80..=0xFF {
151            // Safety: All values in 0x80-0xFF range are valid Unicode codepoints (ISO/IEC 8859-1)
152            // This is guaranteed by the Unicode standard which includes Latin-1 supplement block
153            if let Some(ch) = char::from_u32(i as u32) {
154                self.latin1_map.insert(i, ch);
155            }
156        }
157
158        // Windows-1252 mapping (extends Latin-1 for 0x80-0x9F range)
159        let windows1252_extensions = [
160            (0x80, '€'),        // Euro sign
161            (0x82, '‚'),        // Single low-9 quotation mark
162            (0x83, 'ƒ'),        // Latin small letter f with hook
163            (0x84, '„'),        // Double low-9 quotation mark
164            (0x85, '…'),        // Horizontal ellipsis
165            (0x86, '†'),        // Dagger
166            (0x87, '‡'),        // Double dagger
167            (0x88, 'ˆ'),        // Modifier letter circumflex accent
168            (0x89, '‰'),        // Per mille sign
169            (0x8A, 'Š'),        // Latin capital letter S with caron
170            (0x8B, '‹'),        // Single left-pointing angle quotation mark
171            (0x8C, 'Œ'),        // Latin capital ligature OE
172            (0x8E, 'Ž'),        // Latin capital letter Z with caron
173            (0x91, '\u{2018}'), // Left single quotation mark
174            (0x92, '\u{2019}'), // Right single quotation mark
175            (0x93, '\u{201C}'), // Left double quotation mark
176            (0x94, '\u{201D}'), // Right double quotation mark
177            (0x95, '•'),        // Bullet
178            (0x96, '–'),        // En dash
179            (0x97, '—'),        // Em dash
180            (0x98, '˜'),        // Small tilde
181            (0x99, '™'),        // Trade mark sign
182            (0x9A, 'š'),        // Latin small letter s with caron
183            (0x9B, '›'),        // Single right-pointing angle quotation mark
184            (0x9C, 'œ'),        // Latin small ligature oe
185            (0x9E, 'ž'),        // Latin small letter z with caron
186            (0x9F, 'Ÿ'),        // Latin capital letter Y with diaeresis
187        ];
188
189        // Copy Latin-1 base
190        self.windows1252_map = self.latin1_map.clone();
191        // Override with Windows-1252 extensions
192        for (byte, ch) in windows1252_extensions.iter() {
193            self.windows1252_map.insert(*byte, *ch);
194        }
195
196        // MacRoman mapping (partial - most common characters)
197        let macroman_chars = [
198            (0x80, 'Ä'),
199            (0x81, 'Å'),
200            (0x82, 'Ç'),
201            (0x83, 'É'),
202            (0x84, 'Ñ'),
203            (0x85, 'Ö'),
204            (0x86, 'Ü'),
205            (0x87, 'á'),
206            (0x88, 'à'),
207            (0x89, 'â'),
208            (0x8A, 'ä'),
209            (0x8B, 'ã'),
210            (0x8C, 'å'),
211            (0x8D, 'ç'),
212            (0x8E, 'é'),
213            (0x8F, 'è'),
214            (0x90, 'ê'),
215            (0x91, 'ë'),
216            (0x92, 'í'),
217            (0x93, 'ì'),
218            (0x94, 'î'),
219            (0x95, 'ï'),
220            (0x96, 'ñ'),
221            (0x97, 'ó'),
222            (0x98, 'ò'),
223            (0x99, 'ô'),
224            (0x9A, 'ö'),
225            (0x9B, 'õ'),
226            (0x9C, 'ú'),
227            (0x9D, 'ù'),
228            (0x9E, 'û'),
229            (0x9F, 'ü'),
230            (0xA0, '†'),
231            (0xA1, '°'),
232            (0xA2, '¢'),
233            (0xA3, '£'),
234            (0xA4, '§'),
235            (0xA5, '•'),
236            (0xA6, '¶'),
237            (0xA7, 'ß'),
238            (0xA8, '®'),
239            (0xA9, '©'),
240            (0xAA, '™'),
241            (0xAB, '´'),
242            (0xAC, '¨'),
243            (0xAD, '≠'),
244            (0xAE, 'Æ'),
245            (0xAF, 'Ø'),
246        ];
247
248        for (byte, ch) in macroman_chars.iter() {
249            self.macroman_map.insert(*byte, *ch);
250        }
251    }
252
253    /// Clear the issue log
254    pub fn clear_log(&mut self) {
255        self.issue_log.clear();
256    }
257
258    /// Get the current issue log
259    pub fn get_issues(&self) -> &[EncodingIssue] {
260        &self.issue_log
261    }
262
263    /// Log an encoding issue
264    #[allow(dead_code)]
265    fn log_issue(&mut self, issue: EncodingIssue) {
266        self.issue_log.push(issue);
267    }
268
269    /// Analyze bytes to detect most likely encoding
270    fn analyze_encoding_indicators(&self, bytes: &[u8]) -> Vec<(EncodingType, f64)> {
271        let mut scores = vec![
272            (EncodingType::Utf8, 0.0),
273            (EncodingType::Latin1, 0.0),
274            (EncodingType::Windows1252, 0.0),
275            (EncodingType::MacRoman, 0.0),
276        ];
277
278        // UTF-8 validity check
279        if std::str::from_utf8(bytes).is_ok() {
280            scores[0].1 = 0.9; // High confidence for valid UTF-8
281        }
282
283        // Check for Windows-1252 specific characters
284        let mut windows1252_indicators = 0;
285        let mut latin1_indicators = 0;
286        let mut macroman_indicators = 0;
287
288        for &byte in bytes {
289            if byte >= 0x80 {
290                // Count high-bit characters
291                if self.windows1252_map.contains_key(&byte) {
292                    windows1252_indicators += 1;
293                    // Special boost for Windows-1252 specific chars
294                    if matches!(byte, 0x80 | 0x82..=0x8C | 0x8E | 0x91..=0x9C | 0x9E | 0x9F) {
295                        scores[2].1 += 0.1;
296                    }
297                }
298                if self.latin1_map.contains_key(&byte) {
299                    latin1_indicators += 1;
300                }
301                if self.macroman_map.contains_key(&byte) {
302                    macroman_indicators += 1;
303                }
304            }
305        }
306
307        // Adjust scores based on indicators
308        if windows1252_indicators > 0 {
309            scores[2].1 += 0.3;
310        }
311        if latin1_indicators > 0 {
312            scores[1].1 += 0.2;
313        }
314        if macroman_indicators > 0 {
315            scores[3].1 += 0.1;
316        }
317
318        // Sort by confidence score
319        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
320        scores
321    }
322}
323
324impl Default for EnhancedDecoder {
325    fn default() -> Self {
326        Self::new()
327    }
328}
329
330impl CharacterDecoder for EnhancedDecoder {
331    fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError> {
332        // Try preferred encoding first
333        if let Some(preferred) = options.preferred_encoding {
334            if let Ok(text) = self.decode_with_encoding(bytes, preferred, options.lenient_mode) {
335                let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
336                return Ok(EncodingResult {
337                    text,
338                    detected_encoding: Some(preferred),
339                    replacement_count,
340                    confidence: 0.8,
341                });
342            }
343        }
344
345        // Auto-detect encoding
346        let encoding_candidates = self.analyze_encoding_indicators(bytes);
347
348        for (encoding, confidence) in encoding_candidates {
349            if confidence > 0.1 {
350                match self.decode_with_encoding(bytes, encoding, options.lenient_mode) {
351                    Ok(text) => {
352                        let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
353
354                        if replacement_count <= options.max_replacements {
355                            return Ok(EncodingResult {
356                                text,
357                                detected_encoding: Some(encoding),
358                                replacement_count,
359                                confidence,
360                            });
361                        }
362                    }
363                    Err(_) => continue,
364                }
365            }
366        }
367
368        // Last resort: UTF-8 with replacement
369        if options.lenient_mode {
370            let text = String::from_utf8_lossy(bytes).to_string();
371            let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
372
373            Ok(EncodingResult {
374                text,
375                detected_encoding: Some(EncodingType::Mixed),
376                replacement_count,
377                confidence: 0.1,
378            })
379        } else {
380            Err(PdfError::EncodingError(
381                "Failed to decode text with any supported encoding".to_string(),
382            ))
383        }
384    }
385
386    fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType> {
387        let candidates = self.analyze_encoding_indicators(bytes);
388        candidates.first().map(|(encoding, _)| *encoding)
389    }
390
391    fn decode_with_encoding(
392        &self,
393        bytes: &[u8],
394        encoding: EncodingType,
395        lenient: bool,
396    ) -> Result<String, PdfError> {
397        match encoding {
398            EncodingType::Utf8 => {
399                if lenient {
400                    Ok(String::from_utf8_lossy(bytes).to_string())
401                } else {
402                    String::from_utf8(bytes.to_vec())
403                        .map_err(|e| PdfError::EncodingError(format!("UTF-8 decoding failed: {e}")))
404                }
405            }
406
407            EncodingType::Latin1 => {
408                let mut result = String::with_capacity(bytes.len());
409                for &byte in bytes {
410                    if byte < 0x80 {
411                        result.push(byte as char);
412                    } else if let Some(&ch) = self.latin1_map.get(&byte) {
413                        result.push(ch);
414                    } else if lenient {
415                        result.push('\u{FFFD}');
416                    } else {
417                        return Err(PdfError::EncodingError(format!(
418                            "Invalid Latin-1 character: 0x{byte:02X}"
419                        )));
420                    }
421                }
422                Ok(result)
423            }
424
425            EncodingType::Windows1252 => {
426                let mut result = String::with_capacity(bytes.len());
427                for &byte in bytes {
428                    if byte < 0x80 {
429                        result.push(byte as char);
430                    } else if let Some(&ch) = self.windows1252_map.get(&byte) {
431                        result.push(ch);
432                    } else if lenient {
433                        result.push('\u{FFFD}');
434                    } else {
435                        return Err(PdfError::EncodingError(format!(
436                            "Invalid Windows-1252 character: 0x{byte:02X}"
437                        )));
438                    }
439                }
440                Ok(result)
441            }
442
443            EncodingType::MacRoman => {
444                let mut result = String::with_capacity(bytes.len());
445                for &byte in bytes {
446                    if byte < 0x80 {
447                        result.push(byte as char);
448                    } else if let Some(&ch) = self.macroman_map.get(&byte) {
449                        result.push(ch);
450                    } else if lenient {
451                        result.push('\u{FFFD}');
452                    } else {
453                        return Err(PdfError::EncodingError(format!(
454                            "Invalid MacRoman character: 0x{byte:02X}"
455                        )));
456                    }
457                }
458                Ok(result)
459            }
460
461            EncodingType::PdfDocEncoding => {
462                // PDFDocEncoding is identical to Latin-1 for now
463                self.decode_with_encoding(bytes, EncodingType::Latin1, lenient)
464            }
465
466            EncodingType::Mixed => {
467                // Try multiple encodings and pick the best result
468                let candidates = [
469                    EncodingType::Utf8,
470                    EncodingType::Windows1252,
471                    EncodingType::Latin1,
472                    EncodingType::MacRoman,
473                ];
474
475                for candidate in &candidates {
476                    if let Ok(result) = self.decode_with_encoding(bytes, *candidate, true) {
477                        let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
478                        if replacement_count < bytes.len() / 4 {
479                            // Less than 25% replacement chars
480                            return Ok(result);
481                        }
482                    }
483                }
484
485                // Fallback to UTF-8 lossy
486                Ok(String::from_utf8_lossy(bytes).to_string())
487            }
488        }
489    }
490}
491
492/// Convenience function to decode bytes with default settings
493pub fn decode_text(bytes: &[u8]) -> Result<String, PdfError> {
494    let decoder = EnhancedDecoder::new();
495    let options = EncodingOptions::default();
496    let result = decoder.decode(bytes, &options)?;
497    Ok(result.text)
498}
499
500/// Convenience function to decode bytes with specific encoding
501pub fn decode_text_with_encoding(bytes: &[u8], encoding: EncodingType) -> Result<String, PdfError> {
502    let decoder = EnhancedDecoder::new();
503    decoder.decode_with_encoding(bytes, encoding, true)
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid UTF-8 is returned unchanged and reported as UTF-8.
    #[test]
    fn test_utf8_decoding() {
        let expected = "Hello, 世界!";

        let decoded = EnhancedDecoder::new()
            .decode(expected.as_bytes(), &EncodingOptions::default())
            .unwrap();

        assert_eq!(decoded.text, expected);
        assert_eq!(decoded.detected_encoding, Some(EncodingType::Utf8));
        assert_eq!(decoded.replacement_count, 0);
    }

    /// Latin-1 accented letters survive auto-detected decoding.
    #[test]
    fn test_latin1_decoding() {
        // "Hello, éèç" encoded as Latin-1
        let input: &[u8] = b"Hello, \xE9\xE8\xE7";

        let decoded = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(decoded.text.contains("éèç"));
    }

    /// Windows-1252 specific bytes (Euro sign, smart quotes) are mapped.
    #[test]
    fn test_windows1252_decoding() {
        // "€ ‘Hello’" encoded as Windows-1252
        let input: &[u8] = b"\x80 \x91Hello\x92";

        let decoded = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(decoded.text.contains("€"));
        assert!(decoded.text.contains('\u{2018}')); // Left single quote
        assert!(decoded.text.contains('\u{2019}')); // Right single quote
    }

    /// Lenient UTF-8 decoding substitutes replacement characters.
    #[test]
    fn test_lenient_mode() {
        // Forcing UTF-8 guarantees the invalid prefix yields replacement chars.
        let options = EncodingOptions {
            lenient_mode: true,
            preferred_encoding: Some(EncodingType::Utf8),
            ..EncodingOptions::default()
        };

        // Two bytes that are never valid in UTF-8, followed by "Hello"
        let input: &[u8] = b"\xFF\xFEHello";

        let decoded = EnhancedDecoder::new().decode(input, &options).unwrap();

        assert!(
            decoded.replacement_count > 0,
            "Expected replacement chars, got {}",
            decoded.replacement_count
        );
        assert!(decoded.text.contains("Hello"));
    }

    /// Detection distinguishes UTF-8 from legacy single-byte text.
    #[test]
    fn test_encoding_detection() {
        let decoder = EnhancedDecoder::new();

        // UTF-8
        assert_eq!(
            decoder.detect_encoding("Hello, 世界!".as_bytes()),
            Some(EncodingType::Utf8)
        );

        // Windows-1252 with Euro sign
        let legacy: &[u8] = b"\x80 Hello";
        assert!(matches!(
            decoder.detect_encoding(legacy),
            Some(EncodingType::Windows1252) | Some(EncodingType::Latin1)
        ));
    }

    /// A byte shared by Latin-1 and Windows-1252 decodes identically in both.
    #[test]
    fn test_specific_encoding() {
        let decoder = EnhancedDecoder::new();
        let input = [0xC9_u8]; // É in Latin-1

        let as_latin1 = decoder
            .decode_with_encoding(&input, EncodingType::Latin1, false)
            .unwrap();
        assert_eq!(as_latin1, "É");

        let as_windows1252 = decoder
            .decode_with_encoding(&input, EncodingType::Windows1252, false)
            .unwrap();
        assert_eq!(as_windows1252, "É");
    }

    /// The free-function wrappers behave like the decoder methods.
    #[test]
    fn test_convenience_functions() {
        let plain = "Hello, world!";
        assert_eq!(decode_text(plain.as_bytes()).unwrap(), plain);

        // "Hellé" encoded as Latin-1
        let latin1: &[u8] = b"Hell\x6F\xE9";
        let decoded = decode_text_with_encoding(latin1, EncodingType::Latin1).unwrap();
        assert!(decoded.contains("é"));
    }
}