oxidize_pdf/parser/
encoding.rs

//! Character Encoding Detection and Conversion Module
//!
//! This module provides robust character encoding detection and conversion capabilities
//! to handle the diverse encoding scenarios found in real-world PDF files.
//!
//! # Overview
//!
//! Many PDFs contain text encoded in various character sets beyond UTF-8, including:
//! - Latin-1 (ISO 8859-1) - Common in European documents
//! - Windows-1252 - Microsoft's extension of Latin-1
//! - MacRoman - Apple's legacy encoding
//! - PDF-specific encodings such as PDFDocEncoding
//!
//! This module provides automatic detection and graceful conversion with fallback
//! handling for unrecognized characters.
16
17use crate::error::PdfError;
18use std::collections::HashMap;
19
20/// Character encoding detection and conversion result
21#[derive(Debug, Clone)]
22pub struct EncodingResult {
23    /// The decoded text
24    pub text: String,
25    /// Detected encoding (if any)
26    pub detected_encoding: Option<EncodingType>,
27    /// Number of replacement characters used
28    pub replacement_count: usize,
29    /// Confidence level (0.0 to 1.0)
30    pub confidence: f64,
31}
32
/// Character encodings this module can name when decoding PDF text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EncodingType {
    /// UTF-8, the modern standard.
    Utf8,
    /// ISO 8859-1 (Latin-1), common in European documents.
    Latin1,
    /// Windows-1252, Microsoft's extension of Latin-1.
    Windows1252,
    /// MacRoman, Apple's legacy encoding.
    MacRoman,
    /// The PDF built-in encoding.
    PdfDocEncoding,
    /// Unknown or mixed encoding.
    Mixed,
}
49
50impl EncodingType {
51    /// Get human-readable name
52    pub fn name(&self) -> &'static str {
53        match self {
54            EncodingType::Utf8 => "UTF-8",
55            EncodingType::Latin1 => "ISO 8859-1 (Latin-1)",
56            EncodingType::Windows1252 => "Windows-1252",
57            EncodingType::MacRoman => "MacRoman",
58            EncodingType::PdfDocEncoding => "PDFDocEncoding",
59            EncodingType::Mixed => "Mixed/Unknown",
60        }
61    }
62}
63
64/// Configuration for character encoding processing
65#[derive(Debug, Clone)]
66pub struct EncodingOptions {
67    /// Enable lenient mode (use replacement characters instead of failing)
68    pub lenient_mode: bool,
69    /// Prefer specific encoding for detection
70    pub preferred_encoding: Option<EncodingType>,
71    /// Maximum replacement characters before giving up
72    pub max_replacements: usize,
73    /// Log problematic characters for analysis
74    pub log_issues: bool,
75}
76
77impl Default for EncodingOptions {
78    fn default() -> Self {
79        Self {
80            lenient_mode: true,
81            preferred_encoding: None,
82            max_replacements: 100,
83            log_issues: false,
84        }
85    }
86}
87
88/// Main character decoder trait
89pub trait CharacterDecoder {
90    /// Decode bytes to string with encoding detection
91    fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError>;
92
93    /// Detect the most likely encoding for the given bytes
94    fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType>;
95
96    /// Convert bytes using a specific encoding
97    fn decode_with_encoding(
98        &self,
99        bytes: &[u8],
100        encoding: EncodingType,
101        lenient: bool,
102    ) -> Result<String, PdfError>;
103}
104
105/// Enhanced character decoder implementation
106pub struct EnhancedDecoder {
107    /// Latin-1 to Unicode mapping
108    latin1_map: HashMap<u8, char>,
109    /// Windows-1252 to Unicode mapping
110    windows1252_map: HashMap<u8, char>,
111    /// MacRoman to Unicode mapping
112    macroman_map: HashMap<u8, char>,
113    /// Issue logger for analysis
114    issue_log: Vec<EncodingIssue>,
115}
116
117/// Information about encoding issues encountered
118#[derive(Debug, Clone)]
119pub struct EncodingIssue {
120    pub byte_value: u8,
121    pub context: String,
122    pub attempted_encodings: Vec<EncodingType>,
123    pub resolution: IssueResolution,
124}
125
/// How an encoding issue was resolved.
///
/// NOTE(review): no variant is constructed anywhere in this file, so the
/// per-variant notes are read off the variant names — confirm against callers.
#[derive(Clone, Debug)]
pub enum IssueResolution {
    /// Presumably: a replacement character was emitted for the byte.
    ReplacementCharacter,
    /// Presumably: the byte was converted to the carried character.
    SuccessfulConversion(char),
    /// Presumably: the byte was left out of the output.
    Skipped,
}
132
133impl EnhancedDecoder {
134    /// Create a new enhanced decoder with all encoding tables loaded
135    pub fn new() -> Self {
136        let mut decoder = Self {
137            latin1_map: HashMap::new(),
138            windows1252_map: HashMap::new(),
139            macroman_map: HashMap::new(),
140            issue_log: Vec::new(),
141        };
142
143        decoder.initialize_encoding_tables();
144        decoder
145    }
146
147    /// Initialize all encoding conversion tables
148    fn initialize_encoding_tables(&mut self) {
149        // Latin-1 (ISO 8859-1) mapping - direct 1:1 mapping for 0x80-0xFF
150        for i in 0x80..=0xFF {
151            self.latin1_map.insert(i, char::from_u32(i as u32).unwrap());
152        }
153
154        // Windows-1252 mapping (extends Latin-1 for 0x80-0x9F range)
155        let windows1252_extensions = [
156            (0x80, '€'),        // Euro sign
157            (0x82, '‚'),        // Single low-9 quotation mark
158            (0x83, 'ƒ'),        // Latin small letter f with hook
159            (0x84, '„'),        // Double low-9 quotation mark
160            (0x85, '…'),        // Horizontal ellipsis
161            (0x86, '†'),        // Dagger
162            (0x87, '‡'),        // Double dagger
163            (0x88, 'ˆ'),        // Modifier letter circumflex accent
164            (0x89, '‰'),        // Per mille sign
165            (0x8A, 'Š'),        // Latin capital letter S with caron
166            (0x8B, '‹'),        // Single left-pointing angle quotation mark
167            (0x8C, 'Œ'),        // Latin capital ligature OE
168            (0x8E, 'Ž'),        // Latin capital letter Z with caron
169            (0x91, '\u{2018}'), // Left single quotation mark
170            (0x92, '\u{2019}'), // Right single quotation mark
171            (0x93, '\u{201C}'), // Left double quotation mark
172            (0x94, '\u{201D}'), // Right double quotation mark
173            (0x95, '•'),        // Bullet
174            (0x96, '–'),        // En dash
175            (0x97, '—'),        // Em dash
176            (0x98, '˜'),        // Small tilde
177            (0x99, '™'),        // Trade mark sign
178            (0x9A, 'š'),        // Latin small letter s with caron
179            (0x9B, '›'),        // Single right-pointing angle quotation mark
180            (0x9C, 'œ'),        // Latin small ligature oe
181            (0x9E, 'ž'),        // Latin small letter z with caron
182            (0x9F, 'Ÿ'),        // Latin capital letter Y with diaeresis
183        ];
184
185        // Copy Latin-1 base
186        self.windows1252_map = self.latin1_map.clone();
187        // Override with Windows-1252 extensions
188        for (byte, ch) in windows1252_extensions.iter() {
189            self.windows1252_map.insert(*byte, *ch);
190        }
191
192        // MacRoman mapping (partial - most common characters)
193        let macroman_chars = [
194            (0x80, 'Ä'),
195            (0x81, 'Å'),
196            (0x82, 'Ç'),
197            (0x83, 'É'),
198            (0x84, 'Ñ'),
199            (0x85, 'Ö'),
200            (0x86, 'Ü'),
201            (0x87, 'á'),
202            (0x88, 'à'),
203            (0x89, 'â'),
204            (0x8A, 'ä'),
205            (0x8B, 'ã'),
206            (0x8C, 'å'),
207            (0x8D, 'ç'),
208            (0x8E, 'é'),
209            (0x8F, 'è'),
210            (0x90, 'ê'),
211            (0x91, 'ë'),
212            (0x92, 'í'),
213            (0x93, 'ì'),
214            (0x94, 'î'),
215            (0x95, 'ï'),
216            (0x96, 'ñ'),
217            (0x97, 'ó'),
218            (0x98, 'ò'),
219            (0x99, 'ô'),
220            (0x9A, 'ö'),
221            (0x9B, 'õ'),
222            (0x9C, 'ú'),
223            (0x9D, 'ù'),
224            (0x9E, 'û'),
225            (0x9F, 'ü'),
226            (0xA0, '†'),
227            (0xA1, '°'),
228            (0xA2, '¢'),
229            (0xA3, '£'),
230            (0xA4, '§'),
231            (0xA5, '•'),
232            (0xA6, '¶'),
233            (0xA7, 'ß'),
234            (0xA8, '®'),
235            (0xA9, '©'),
236            (0xAA, '™'),
237            (0xAB, '´'),
238            (0xAC, '¨'),
239            (0xAD, '≠'),
240            (0xAE, 'Æ'),
241            (0xAF, 'Ø'),
242        ];
243
244        for (byte, ch) in macroman_chars.iter() {
245            self.macroman_map.insert(*byte, *ch);
246        }
247    }
248
249    /// Clear the issue log
250    pub fn clear_log(&mut self) {
251        self.issue_log.clear();
252    }
253
254    /// Get the current issue log
255    pub fn get_issues(&self) -> &[EncodingIssue] {
256        &self.issue_log
257    }
258
259    /// Log an encoding issue
260    #[allow(dead_code)]
261    fn log_issue(&mut self, issue: EncodingIssue) {
262        self.issue_log.push(issue);
263    }
264
265    /// Analyze bytes to detect most likely encoding
266    fn analyze_encoding_indicators(&self, bytes: &[u8]) -> Vec<(EncodingType, f64)> {
267        let mut scores = vec![
268            (EncodingType::Utf8, 0.0),
269            (EncodingType::Latin1, 0.0),
270            (EncodingType::Windows1252, 0.0),
271            (EncodingType::MacRoman, 0.0),
272        ];
273
274        // UTF-8 validity check
275        if std::str::from_utf8(bytes).is_ok() {
276            scores[0].1 = 0.9; // High confidence for valid UTF-8
277        }
278
279        // Check for Windows-1252 specific characters
280        let mut windows1252_indicators = 0;
281        let mut latin1_indicators = 0;
282        let mut macroman_indicators = 0;
283
284        for &byte in bytes {
285            if byte >= 0x80 {
286                // Count high-bit characters
287                if self.windows1252_map.contains_key(&byte) {
288                    windows1252_indicators += 1;
289                    // Special boost for Windows-1252 specific chars
290                    if matches!(byte, 0x80 | 0x82..=0x8C | 0x8E | 0x91..=0x9C | 0x9E | 0x9F) {
291                        scores[2].1 += 0.1;
292                    }
293                }
294                if self.latin1_map.contains_key(&byte) {
295                    latin1_indicators += 1;
296                }
297                if self.macroman_map.contains_key(&byte) {
298                    macroman_indicators += 1;
299                }
300            }
301        }
302
303        // Adjust scores based on indicators
304        if windows1252_indicators > 0 {
305            scores[2].1 += 0.3;
306        }
307        if latin1_indicators > 0 {
308            scores[1].1 += 0.2;
309        }
310        if macroman_indicators > 0 {
311            scores[3].1 += 0.1;
312        }
313
314        // Sort by confidence score
315        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
316        scores
317    }
318}
319
320impl Default for EnhancedDecoder {
321    fn default() -> Self {
322        Self::new()
323    }
324}
325
326impl CharacterDecoder for EnhancedDecoder {
327    fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError> {
328        // Try preferred encoding first
329        if let Some(preferred) = options.preferred_encoding {
330            if let Ok(text) = self.decode_with_encoding(bytes, preferred, options.lenient_mode) {
331                let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
332                return Ok(EncodingResult {
333                    text,
334                    detected_encoding: Some(preferred),
335                    replacement_count,
336                    confidence: 0.8,
337                });
338            }
339        }
340
341        // Auto-detect encoding
342        let encoding_candidates = self.analyze_encoding_indicators(bytes);
343
344        for (encoding, confidence) in encoding_candidates {
345            if confidence > 0.1 {
346                match self.decode_with_encoding(bytes, encoding, options.lenient_mode) {
347                    Ok(text) => {
348                        let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
349
350                        if replacement_count <= options.max_replacements {
351                            return Ok(EncodingResult {
352                                text,
353                                detected_encoding: Some(encoding),
354                                replacement_count,
355                                confidence,
356                            });
357                        }
358                    }
359                    Err(_) => continue,
360                }
361            }
362        }
363
364        // Last resort: UTF-8 with replacement
365        if options.lenient_mode {
366            let text = String::from_utf8_lossy(bytes).to_string();
367            let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
368
369            Ok(EncodingResult {
370                text,
371                detected_encoding: Some(EncodingType::Mixed),
372                replacement_count,
373                confidence: 0.1,
374            })
375        } else {
376            Err(PdfError::EncodingError(
377                "Failed to decode text with any supported encoding".to_string(),
378            ))
379        }
380    }
381
382    fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType> {
383        let candidates = self.analyze_encoding_indicators(bytes);
384        candidates.first().map(|(encoding, _)| *encoding)
385    }
386
387    fn decode_with_encoding(
388        &self,
389        bytes: &[u8],
390        encoding: EncodingType,
391        lenient: bool,
392    ) -> Result<String, PdfError> {
393        match encoding {
394            EncodingType::Utf8 => {
395                if lenient {
396                    Ok(String::from_utf8_lossy(bytes).to_string())
397                } else {
398                    String::from_utf8(bytes.to_vec())
399                        .map_err(|e| PdfError::EncodingError(format!("UTF-8 decoding failed: {e}")))
400                }
401            }
402
403            EncodingType::Latin1 => {
404                let mut result = String::with_capacity(bytes.len());
405                for &byte in bytes {
406                    if byte < 0x80 {
407                        result.push(byte as char);
408                    } else if let Some(&ch) = self.latin1_map.get(&byte) {
409                        result.push(ch);
410                    } else if lenient {
411                        result.push('\u{FFFD}');
412                    } else {
413                        return Err(PdfError::EncodingError(format!(
414                            "Invalid Latin-1 character: 0x{byte:02X}"
415                        )));
416                    }
417                }
418                Ok(result)
419            }
420
421            EncodingType::Windows1252 => {
422                let mut result = String::with_capacity(bytes.len());
423                for &byte in bytes {
424                    if byte < 0x80 {
425                        result.push(byte as char);
426                    } else if let Some(&ch) = self.windows1252_map.get(&byte) {
427                        result.push(ch);
428                    } else if lenient {
429                        result.push('\u{FFFD}');
430                    } else {
431                        return Err(PdfError::EncodingError(format!(
432                            "Invalid Windows-1252 character: 0x{byte:02X}"
433                        )));
434                    }
435                }
436                Ok(result)
437            }
438
439            EncodingType::MacRoman => {
440                let mut result = String::with_capacity(bytes.len());
441                for &byte in bytes {
442                    if byte < 0x80 {
443                        result.push(byte as char);
444                    } else if let Some(&ch) = self.macroman_map.get(&byte) {
445                        result.push(ch);
446                    } else if lenient {
447                        result.push('\u{FFFD}');
448                    } else {
449                        return Err(PdfError::EncodingError(format!(
450                            "Invalid MacRoman character: 0x{byte:02X}"
451                        )));
452                    }
453                }
454                Ok(result)
455            }
456
457            EncodingType::PdfDocEncoding => {
458                // PDFDocEncoding is identical to Latin-1 for now
459                self.decode_with_encoding(bytes, EncodingType::Latin1, lenient)
460            }
461
462            EncodingType::Mixed => {
463                // Try multiple encodings and pick the best result
464                let candidates = [
465                    EncodingType::Utf8,
466                    EncodingType::Windows1252,
467                    EncodingType::Latin1,
468                    EncodingType::MacRoman,
469                ];
470
471                for candidate in &candidates {
472                    if let Ok(result) = self.decode_with_encoding(bytes, *candidate, true) {
473                        let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
474                        if replacement_count < bytes.len() / 4 {
475                            // Less than 25% replacement chars
476                            return Ok(result);
477                        }
478                    }
479                }
480
481                // Fallback to UTF-8 lossy
482                Ok(String::from_utf8_lossy(bytes).to_string())
483            }
484        }
485    }
486}
487
488/// Convenience function to decode bytes with default settings
489pub fn decode_text(bytes: &[u8]) -> Result<String, PdfError> {
490    let decoder = EnhancedDecoder::new();
491    let options = EncodingOptions::default();
492    let result = decoder.decode(bytes, &options)?;
493    Ok(result.text)
494}
495
496/// Convenience function to decode bytes with specific encoding
497pub fn decode_text_with_encoding(bytes: &[u8], encoding: EncodingType) -> Result<String, PdfError> {
498    let decoder = EnhancedDecoder::new();
499    decoder.decode_with_encoding(bytes, encoding, true)
500}
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid UTF-8 round-trips and is reported as UTF-8 without replacements.
    #[test]
    fn test_utf8_decoding() {
        let expected = "Hello, 世界!";

        let outcome = EnhancedDecoder::new()
            .decode(expected.as_bytes(), &EncodingOptions::default())
            .unwrap();

        assert_eq!(outcome.text, expected);
        assert_eq!(outcome.detected_encoding, Some(EncodingType::Utf8));
        assert_eq!(outcome.replacement_count, 0);
    }

    /// Accented Latin-1 bytes survive automatic decoding.
    #[test]
    fn test_latin1_decoding() {
        // "Hello, éèç" in Latin-1
        let input = b"Hello, \xE9\xE8\xE7";

        let outcome = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("éèç"));
    }

    /// The Euro sign and smart quotes of Windows-1252 are decoded.
    #[test]
    fn test_windows1252_decoding() {
        // "€ 'Hello'" with curly single quotes, in Windows-1252
        let input = b"\x80 \x91Hello\x92";

        let outcome = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("€"));
        assert!(outcome.text.contains('\u{2018}')); // Left single quote
        assert!(outcome.text.contains('\u{2019}')); // Right single quote
    }

    /// Forcing UTF-8 on invalid input yields replacement characters in
    /// lenient mode while keeping the valid remainder.
    #[test]
    fn test_lenient_mode() {
        let options = EncodingOptions {
            lenient_mode: true,
            preferred_encoding: Some(EncodingType::Utf8), // Force UTF-8 to get replacement chars
            ..EncodingOptions::default()
        };

        // Two bytes that are invalid in UTF-8, followed by "Hello"
        let input = b"\xFF\xFEHello";

        let outcome = EnhancedDecoder::new().decode(input, &options).unwrap();

        assert!(
            outcome.replacement_count > 0,
            "Expected replacement chars, got {}",
            outcome.replacement_count
        );
        assert!(outcome.text.contains("Hello"));
    }

    /// Detection distinguishes valid UTF-8 from single-byte input.
    #[test]
    fn test_encoding_detection() {
        let decoder = EnhancedDecoder::new();

        // UTF-8
        assert_eq!(
            decoder.detect_encoding("Hello, 世界!".as_bytes()),
            Some(EncodingType::Utf8)
        );

        // Windows-1252 with Euro sign
        let guess = decoder.detect_encoding(b"\x80 Hello");
        assert!(matches!(
            guess,
            Some(EncodingType::Windows1252) | Some(EncodingType::Latin1)
        ));
    }

    /// Byte 0xC9 is "É" in both Latin-1 and Windows-1252, even in strict mode.
    #[test]
    fn test_specific_encoding() {
        let decoder = EnhancedDecoder::new();
        let input = [0xC9_u8];

        for encoding in [EncodingType::Latin1, EncodingType::Windows1252] {
            let decoded = decoder
                .decode_with_encoding(&input, encoding, false)
                .unwrap();
            assert_eq!(decoded, "É");
        }
    }

    /// The free-function wrappers decode like the decoder methods.
    #[test]
    fn test_convenience_functions() {
        let plain = "Hello, world!";
        assert_eq!(decode_text(plain.as_bytes()).unwrap(), plain);

        // "Hello" followed by é in Latin-1
        let decoded = decode_text_with_encoding(b"Hello\xE9", EncodingType::Latin1).unwrap();
        assert!(decoded.contains("é"));
    }
}