1use crate::error::PdfError;
18use std::collections::HashMap;
19
20#[derive(Debug, Clone)]
22pub struct EncodingResult {
23 pub text: String,
25 pub detected_encoding: Option<EncodingType>,
27 pub replacement_count: usize,
29 pub confidence: f64,
31}
32
/// Text encodings this module can name, detect, or decode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingType {
    /// UTF-8.
    Utf8,
    /// ISO 8859-1 (Latin-1).
    Latin1,
    /// Windows-1252.
    Windows1252,
    /// MacRoman.
    MacRoman,
    /// PDFDocEncoding.
    PdfDocEncoding,
    /// Mixed or unknown encoding.
    Mixed,
}
49
50impl EncodingType {
51 pub fn name(&self) -> &'static str {
53 match self {
54 EncodingType::Utf8 => "UTF-8",
55 EncodingType::Latin1 => "ISO 8859-1 (Latin-1)",
56 EncodingType::Windows1252 => "Windows-1252",
57 EncodingType::MacRoman => "MacRoman",
58 EncodingType::PdfDocEncoding => "PDFDocEncoding",
59 EncodingType::Mixed => "Mixed/Unknown",
60 }
61 }
62}
63
64#[derive(Debug, Clone)]
66pub struct EncodingOptions {
67 pub lenient_mode: bool,
69 pub preferred_encoding: Option<EncodingType>,
71 pub max_replacements: usize,
73 pub log_issues: bool,
75}
76
77impl Default for EncodingOptions {
78 fn default() -> Self {
79 Self {
80 lenient_mode: true,
81 preferred_encoding: None,
82 max_replacements: 100,
83 log_issues: false,
84 }
85 }
86}
87
88pub trait CharacterDecoder {
90 fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError>;
92
93 fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType>;
95
96 fn decode_with_encoding(
98 &self,
99 bytes: &[u8],
100 encoding: EncodingType,
101 lenient: bool,
102 ) -> Result<String, PdfError>;
103}
104
105pub struct EnhancedDecoder {
107 latin1_map: HashMap<u8, char>,
109 windows1252_map: HashMap<u8, char>,
111 macroman_map: HashMap<u8, char>,
113 issue_log: Vec<EncodingIssue>,
115}
116
117#[derive(Debug, Clone)]
119pub struct EncodingIssue {
120 pub byte_value: u8,
121 pub context: String,
122 pub attempted_encodings: Vec<EncodingType>,
123 pub resolution: IssueResolution,
124}
125
/// How an [`EncodingIssue`] was resolved.
// NOTE(review): variant meanings are inferred from their names; no visible
// code in this file produces them — confirm.
#[derive(Debug, Clone)]
pub enum IssueResolution {
    /// A replacement character was emitted for the byte.
    ReplacementCharacter,
    /// The byte was converted to the contained character.
    SuccessfulConversion(char),
    /// The byte was skipped.
    Skipped,
}
132
133impl EnhancedDecoder {
134 pub fn new() -> Self {
136 let mut decoder = Self {
137 latin1_map: HashMap::new(),
138 windows1252_map: HashMap::new(),
139 macroman_map: HashMap::new(),
140 issue_log: Vec::new(),
141 };
142
143 decoder.initialize_encoding_tables();
144 decoder
145 }
146
147 fn initialize_encoding_tables(&mut self) {
149 for i in 0x80..=0xFF {
151 self.latin1_map.insert(i, char::from_u32(i as u32).unwrap());
152 }
153
154 let windows1252_extensions = [
156 (0x80, '€'), (0x82, '‚'), (0x83, 'ƒ'), (0x84, '„'), (0x85, '…'), (0x86, '†'), (0x87, '‡'), (0x88, 'ˆ'), (0x89, '‰'), (0x8A, 'Š'), (0x8B, '‹'), (0x8C, 'Œ'), (0x8E, 'Ž'), (0x91, '\u{2018}'), (0x92, '\u{2019}'), (0x93, '\u{201C}'), (0x94, '\u{201D}'), (0x95, '•'), (0x96, '–'), (0x97, '—'), (0x98, '˜'), (0x99, '™'), (0x9A, 'š'), (0x9B, '›'), (0x9C, 'œ'), (0x9E, 'ž'), (0x9F, 'Ÿ'), ];
184
185 self.windows1252_map = self.latin1_map.clone();
187 for (byte, ch) in windows1252_extensions.iter() {
189 self.windows1252_map.insert(*byte, *ch);
190 }
191
192 let macroman_chars = [
194 (0x80, 'Ä'),
195 (0x81, 'Å'),
196 (0x82, 'Ç'),
197 (0x83, 'É'),
198 (0x84, 'Ñ'),
199 (0x85, 'Ö'),
200 (0x86, 'Ü'),
201 (0x87, 'á'),
202 (0x88, 'à'),
203 (0x89, 'â'),
204 (0x8A, 'ä'),
205 (0x8B, 'ã'),
206 (0x8C, 'å'),
207 (0x8D, 'ç'),
208 (0x8E, 'é'),
209 (0x8F, 'è'),
210 (0x90, 'ê'),
211 (0x91, 'ë'),
212 (0x92, 'í'),
213 (0x93, 'ì'),
214 (0x94, 'î'),
215 (0x95, 'ï'),
216 (0x96, 'ñ'),
217 (0x97, 'ó'),
218 (0x98, 'ò'),
219 (0x99, 'ô'),
220 (0x9A, 'ö'),
221 (0x9B, 'õ'),
222 (0x9C, 'ú'),
223 (0x9D, 'ù'),
224 (0x9E, 'û'),
225 (0x9F, 'ü'),
226 (0xA0, '†'),
227 (0xA1, '°'),
228 (0xA2, '¢'),
229 (0xA3, '£'),
230 (0xA4, '§'),
231 (0xA5, '•'),
232 (0xA6, '¶'),
233 (0xA7, 'ß'),
234 (0xA8, '®'),
235 (0xA9, '©'),
236 (0xAA, '™'),
237 (0xAB, '´'),
238 (0xAC, '¨'),
239 (0xAD, '≠'),
240 (0xAE, 'Æ'),
241 (0xAF, 'Ø'),
242 ];
243
244 for (byte, ch) in macroman_chars.iter() {
245 self.macroman_map.insert(*byte, *ch);
246 }
247 }
248
249 pub fn clear_log(&mut self) {
251 self.issue_log.clear();
252 }
253
254 pub fn get_issues(&self) -> &[EncodingIssue] {
256 &self.issue_log
257 }
258
259 #[allow(dead_code)]
261 fn log_issue(&mut self, issue: EncodingIssue) {
262 self.issue_log.push(issue);
263 }
264
265 fn analyze_encoding_indicators(&self, bytes: &[u8]) -> Vec<(EncodingType, f64)> {
267 let mut scores = vec![
268 (EncodingType::Utf8, 0.0),
269 (EncodingType::Latin1, 0.0),
270 (EncodingType::Windows1252, 0.0),
271 (EncodingType::MacRoman, 0.0),
272 ];
273
274 if std::str::from_utf8(bytes).is_ok() {
276 scores[0].1 = 0.9; }
278
279 let mut windows1252_indicators = 0;
281 let mut latin1_indicators = 0;
282 let mut macroman_indicators = 0;
283
284 for &byte in bytes {
285 if byte >= 0x80 {
286 if self.windows1252_map.contains_key(&byte) {
288 windows1252_indicators += 1;
289 if matches!(byte, 0x80 | 0x82..=0x8C | 0x8E | 0x91..=0x9C | 0x9E | 0x9F) {
291 scores[2].1 += 0.1;
292 }
293 }
294 if self.latin1_map.contains_key(&byte) {
295 latin1_indicators += 1;
296 }
297 if self.macroman_map.contains_key(&byte) {
298 macroman_indicators += 1;
299 }
300 }
301 }
302
303 if windows1252_indicators > 0 {
305 scores[2].1 += 0.3;
306 }
307 if latin1_indicators > 0 {
308 scores[1].1 += 0.2;
309 }
310 if macroman_indicators > 0 {
311 scores[3].1 += 0.1;
312 }
313
314 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
316 scores
317 }
318}
319
320impl Default for EnhancedDecoder {
321 fn default() -> Self {
322 Self::new()
323 }
324}
325
326impl CharacterDecoder for EnhancedDecoder {
327 fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError> {
328 if let Some(preferred) = options.preferred_encoding {
330 if let Ok(text) = self.decode_with_encoding(bytes, preferred, options.lenient_mode) {
331 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
332 return Ok(EncodingResult {
333 text,
334 detected_encoding: Some(preferred),
335 replacement_count,
336 confidence: 0.8,
337 });
338 }
339 }
340
341 let encoding_candidates = self.analyze_encoding_indicators(bytes);
343
344 for (encoding, confidence) in encoding_candidates {
345 if confidence > 0.1 {
346 match self.decode_with_encoding(bytes, encoding, options.lenient_mode) {
347 Ok(text) => {
348 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
349
350 if replacement_count <= options.max_replacements {
351 return Ok(EncodingResult {
352 text,
353 detected_encoding: Some(encoding),
354 replacement_count,
355 confidence,
356 });
357 }
358 }
359 Err(_) => continue,
360 }
361 }
362 }
363
364 if options.lenient_mode {
366 let text = String::from_utf8_lossy(bytes).to_string();
367 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
368
369 Ok(EncodingResult {
370 text,
371 detected_encoding: Some(EncodingType::Mixed),
372 replacement_count,
373 confidence: 0.1,
374 })
375 } else {
376 Err(PdfError::EncodingError(
377 "Failed to decode text with any supported encoding".to_string(),
378 ))
379 }
380 }
381
382 fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType> {
383 let candidates = self.analyze_encoding_indicators(bytes);
384 candidates.first().map(|(encoding, _)| *encoding)
385 }
386
387 fn decode_with_encoding(
388 &self,
389 bytes: &[u8],
390 encoding: EncodingType,
391 lenient: bool,
392 ) -> Result<String, PdfError> {
393 match encoding {
394 EncodingType::Utf8 => {
395 if lenient {
396 Ok(String::from_utf8_lossy(bytes).to_string())
397 } else {
398 String::from_utf8(bytes.to_vec())
399 .map_err(|e| PdfError::EncodingError(format!("UTF-8 decoding failed: {e}")))
400 }
401 }
402
403 EncodingType::Latin1 => {
404 let mut result = String::with_capacity(bytes.len());
405 for &byte in bytes {
406 if byte < 0x80 {
407 result.push(byte as char);
408 } else if let Some(&ch) = self.latin1_map.get(&byte) {
409 result.push(ch);
410 } else if lenient {
411 result.push('\u{FFFD}');
412 } else {
413 return Err(PdfError::EncodingError(format!(
414 "Invalid Latin-1 character: 0x{byte:02X}"
415 )));
416 }
417 }
418 Ok(result)
419 }
420
421 EncodingType::Windows1252 => {
422 let mut result = String::with_capacity(bytes.len());
423 for &byte in bytes {
424 if byte < 0x80 {
425 result.push(byte as char);
426 } else if let Some(&ch) = self.windows1252_map.get(&byte) {
427 result.push(ch);
428 } else if lenient {
429 result.push('\u{FFFD}');
430 } else {
431 return Err(PdfError::EncodingError(format!(
432 "Invalid Windows-1252 character: 0x{byte:02X}"
433 )));
434 }
435 }
436 Ok(result)
437 }
438
439 EncodingType::MacRoman => {
440 let mut result = String::with_capacity(bytes.len());
441 for &byte in bytes {
442 if byte < 0x80 {
443 result.push(byte as char);
444 } else if let Some(&ch) = self.macroman_map.get(&byte) {
445 result.push(ch);
446 } else if lenient {
447 result.push('\u{FFFD}');
448 } else {
449 return Err(PdfError::EncodingError(format!(
450 "Invalid MacRoman character: 0x{byte:02X}"
451 )));
452 }
453 }
454 Ok(result)
455 }
456
457 EncodingType::PdfDocEncoding => {
458 self.decode_with_encoding(bytes, EncodingType::Latin1, lenient)
460 }
461
462 EncodingType::Mixed => {
463 let candidates = [
465 EncodingType::Utf8,
466 EncodingType::Windows1252,
467 EncodingType::Latin1,
468 EncodingType::MacRoman,
469 ];
470
471 for candidate in &candidates {
472 if let Ok(result) = self.decode_with_encoding(bytes, *candidate, true) {
473 let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
474 if replacement_count < bytes.len() / 4 {
475 return Ok(result);
477 }
478 }
479 }
480
481 Ok(String::from_utf8_lossy(bytes).to_string())
483 }
484 }
485 }
486}
487
488pub fn decode_text(bytes: &[u8]) -> Result<String, PdfError> {
490 let decoder = EnhancedDecoder::new();
491 let options = EncodingOptions::default();
492 let result = decoder.decode(bytes, &options)?;
493 Ok(result.text)
494}
495
496pub fn decode_text_with_encoding(bytes: &[u8], encoding: EncodingType) -> Result<String, PdfError> {
498 let decoder = EnhancedDecoder::new();
499 decoder.decode_with_encoding(bytes, encoding, true)
500}
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid UTF-8 is returned unchanged and reported as UTF-8.
    #[test]
    fn test_utf8_decoding() {
        let expected = "Hello, 世界!";

        let outcome = EnhancedDecoder::new()
            .decode(expected.as_bytes(), &EncodingOptions::default())
            .unwrap();

        assert_eq!(outcome.text, expected);
        assert_eq!(outcome.detected_encoding, Some(EncodingType::Utf8));
        assert_eq!(outcome.replacement_count, 0);
    }

    /// Latin-1 accented letters survive automatic detection.
    #[test]
    fn test_latin1_decoding() {
        // "Hello, " followed by 0xE9 0xE8 0xE7.
        let input = [0x48u8, 0x65, 0x6C, 0x6C, 0x6F, 0x2C, 0x20, 0xE9, 0xE8, 0xE7];

        let outcome = EnhancedDecoder::new()
            .decode(&input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("éèç"));
    }

    /// Windows-1252 specific bytes decode to the euro sign and curly quotes.
    #[test]
    fn test_windows1252_decoding() {
        // 0x80, space, 0x91, "Hello", 0x92.
        let input = [0x80u8, 0x20, 0x91, 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x92];

        let outcome = EnhancedDecoder::new()
            .decode(&input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("€"));
        assert!(outcome.text.contains('\u{2018}'));
        assert!(outcome.text.contains('\u{2019}'));
    }

    /// Lenient UTF-8 decoding substitutes replacement characters for bad bytes.
    #[test]
    fn test_lenient_mode() {
        let options = EncodingOptions {
            lenient_mode: true,
            preferred_encoding: Some(EncodingType::Utf8),
            ..EncodingOptions::default()
        };
        // Two invalid bytes followed by "Hello".
        let input = [0xFFu8, 0xFE, 0x48, 0x65, 0x6C, 0x6C, 0x6F];

        let outcome = EnhancedDecoder::new().decode(&input, &options).unwrap();

        assert!(
            outcome.replacement_count > 0,
            "Expected replacement chars, got {}",
            outcome.replacement_count
        );
        assert!(outcome.text.contains("Hello"));
    }

    /// Detection recognises UTF-8 and a single-byte encoding for 0x80.
    #[test]
    fn test_encoding_detection() {
        let decoder = EnhancedDecoder::new();

        assert_eq!(
            decoder.detect_encoding("Hello, 世界!".as_bytes()),
            Some(EncodingType::Utf8)
        );

        // 0x80, space, "Hello".
        let single_byte_input = [0x80u8, 0x20, 0x48, 0x65, 0x6C, 0x6C, 0x6F];
        let guess = decoder.detect_encoding(&single_byte_input);
        assert!(matches!(
            guess,
            Some(EncodingType::Windows1252) | Some(EncodingType::Latin1)
        ));
    }

    /// 0xC9 decodes to the same letter in strict Latin-1 and Windows-1252.
    #[test]
    fn test_specific_encoding() {
        let decoder = EnhancedDecoder::new();
        let input = [0xC9u8];

        let as_latin1 = decoder
            .decode_with_encoding(&input, EncodingType::Latin1, false)
            .unwrap();
        assert_eq!(as_latin1, "É");

        let as_windows1252 = decoder
            .decode_with_encoding(&input, EncodingType::Windows1252, false)
            .unwrap();
        assert_eq!(as_windows1252, "É");
    }

    /// The free-function wrappers behave like the decoder they wrap.
    #[test]
    fn test_convenience_functions() {
        let expected = "Hello, world!";
        assert_eq!(decode_text(expected.as_bytes()).unwrap(), expected);

        // "Hello" followed by 0xE9.
        let latin1_input = [0x48u8, 0x65, 0x6C, 0x6C, 0x6F, 0xE9];
        let text = decode_text_with_encoding(&latin1_input, EncodingType::Latin1).unwrap();
        assert!(text.contains("é"));
    }
}