1use crate::error::PdfError;
18use std::collections::HashMap;
19
/// Outcome of a decoding attempt: the decoded text plus metadata describing
/// how it was obtained.
#[derive(Debug, Clone)]
pub struct EncodingResult {
    /// The decoded text.
    pub text: String,
    /// Encoding that produced `text`, if one was identified.
    pub detected_encoding: Option<EncodingType>,
    /// Number of U+FFFD replacement characters present in `text`.
    pub replacement_count: usize,
    /// Heuristic confidence score for `detected_encoding`; higher means more
    /// confident.
    pub confidence: f64,
}
32
/// Character encodings the decoder can report or be asked to apply.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingType {
    /// UTF-8.
    Utf8,
    /// ISO 8859-1 (Latin-1).
    Latin1,
    /// Windows-1252.
    Windows1252,
    /// MacRoman.
    MacRoman,
    /// PDFDocEncoding.
    PdfDocEncoding,
    /// Mixed or unknown encoding.
    Mixed,
}

impl EncodingType {
    /// Returns the human-readable label of this encoding.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::Utf8 => "UTF-8",
            Self::Latin1 => "ISO 8859-1 (Latin-1)",
            Self::Windows1252 => "Windows-1252",
            Self::MacRoman => "MacRoman",
            Self::PdfDocEncoding => "PDFDocEncoding",
            Self::Mixed => "Mixed/Unknown",
        }
    }
}
63
/// Caller-supplied knobs controlling how bytes are decoded.
#[derive(Debug, Clone)]
pub struct EncodingOptions {
    /// When `true`, undecodable bytes become U+FFFD instead of producing an
    /// error, and a lossy UTF-8 fallback is used if no candidate is accepted.
    pub lenient_mode: bool,
    /// Encoding to try before any auto-detection, if set.
    pub preferred_encoding: Option<EncodingType>,
    /// Maximum number of U+FFFD characters tolerated in the output of an
    /// auto-detected candidate encoding.
    pub max_replacements: usize,
    /// Whether encoding issues should be logged.
    /// NOTE(review): no code visible in this file reads this flag — confirm
    /// where (or whether) it is consumed.
    pub log_issues: bool,
}
76
77impl Default for EncodingOptions {
78 fn default() -> Self {
79 Self {
80 lenient_mode: true,
81 preferred_encoding: None,
82 max_replacements: 100,
83 log_issues: false,
84 }
85 }
86}
87
/// Interface for turning raw bytes into text, with or without a known
/// encoding.
pub trait CharacterDecoder {
    /// Decodes `bytes` according to `options`, returning the text together
    /// with metadata about the encoding that was used.
    ///
    /// # Errors
    ///
    /// Returns a [`PdfError`] when the implementation cannot produce text for
    /// the given input and options.
    fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError>;

    /// Returns the encoding the implementation considers most likely for
    /// `bytes`, or `None` if it has no candidate.
    fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType>;

    /// Decodes `bytes` using exactly `encoding`.
    ///
    /// `lenient` selects between substituting undecodable input and failing.
    ///
    /// # Errors
    ///
    /// Returns a [`PdfError`] when the bytes cannot be decoded with
    /// `encoding`.
    fn decode_with_encoding(
        &self,
        bytes: &[u8],
        encoding: EncodingType,
        lenient: bool,
    ) -> Result<String, PdfError>;
}
104
/// Table-driven decoder for UTF-8 and several single-byte encodings.
pub struct EnhancedDecoder {
    /// Lookup table from high bytes (>= 0x80) to characters for Latin-1.
    latin1_map: HashMap<u8, char>,
    /// Lookup table from high bytes (>= 0x80) to characters for Windows-1252.
    windows1252_map: HashMap<u8, char>,
    /// Lookup table from high bytes (>= 0x80) to characters for MacRoman.
    macroman_map: HashMap<u8, char>,
    /// Accumulated encoding issues; exposed via `get_issues` and emptied by
    /// `clear_log`.
    issue_log: Vec<EncodingIssue>,
}
116
/// Record of a single problematic byte stored in the decoder's issue log.
///
/// NOTE(review): nothing in this file constructs this type, so the field
/// descriptions below are inferred from the field names and types — confirm
/// against the producer.
#[derive(Debug, Clone)]
pub struct EncodingIssue {
    /// The byte the issue refers to.
    pub byte_value: u8,
    /// Free-form context for the issue (presumably surrounding text or a
    /// location description — verify).
    pub context: String,
    /// Encodings associated with the issue (presumably the ones that were
    /// tried — verify).
    pub attempted_encodings: Vec<EncodingType>,
    /// How the issue was resolved.
    pub resolution: IssueResolution,
}
125
/// How an [`EncodingIssue`] was resolved.
///
/// NOTE(review): no code in this file constructs these variants; the
/// descriptions follow the variant names — confirm against the producer.
#[derive(Debug, Clone)]
pub enum IssueResolution {
    /// A replacement character was used in place of the byte.
    ReplacementCharacter,
    /// The byte was converted to the contained character.
    SuccessfulConversion(char),
    /// The byte was skipped.
    Skipped,
}
132
133impl EnhancedDecoder {
134 pub fn new() -> Self {
136 let mut decoder = Self {
137 latin1_map: HashMap::new(),
138 windows1252_map: HashMap::new(),
139 macroman_map: HashMap::new(),
140 issue_log: Vec::new(),
141 };
142
143 decoder.initialize_encoding_tables();
144 decoder
145 }
146
147 fn initialize_encoding_tables(&mut self) {
149 for i in 0x80..=0xFF {
151 if let Some(ch) = char::from_u32(i as u32) {
154 self.latin1_map.insert(i, ch);
155 }
156 }
157
158 let windows1252_extensions = [
160 (0x80, '€'), (0x82, '‚'), (0x83, 'ƒ'), (0x84, '„'), (0x85, '…'), (0x86, '†'), (0x87, '‡'), (0x88, 'ˆ'), (0x89, '‰'), (0x8A, 'Š'), (0x8B, '‹'), (0x8C, 'Œ'), (0x8E, 'Ž'), (0x91, '\u{2018}'), (0x92, '\u{2019}'), (0x93, '\u{201C}'), (0x94, '\u{201D}'), (0x95, '•'), (0x96, '–'), (0x97, '—'), (0x98, '˜'), (0x99, '™'), (0x9A, 'š'), (0x9B, '›'), (0x9C, 'œ'), (0x9E, 'ž'), (0x9F, 'Ÿ'), ];
188
189 self.windows1252_map = self.latin1_map.clone();
191 for (byte, ch) in windows1252_extensions.iter() {
193 self.windows1252_map.insert(*byte, *ch);
194 }
195
196 let macroman_chars = [
198 (0x80, 'Ä'),
199 (0x81, 'Å'),
200 (0x82, 'Ç'),
201 (0x83, 'É'),
202 (0x84, 'Ñ'),
203 (0x85, 'Ö'),
204 (0x86, 'Ü'),
205 (0x87, 'á'),
206 (0x88, 'à'),
207 (0x89, 'â'),
208 (0x8A, 'ä'),
209 (0x8B, 'ã'),
210 (0x8C, 'å'),
211 (0x8D, 'ç'),
212 (0x8E, 'é'),
213 (0x8F, 'è'),
214 (0x90, 'ê'),
215 (0x91, 'ë'),
216 (0x92, 'í'),
217 (0x93, 'ì'),
218 (0x94, 'î'),
219 (0x95, 'ï'),
220 (0x96, 'ñ'),
221 (0x97, 'ó'),
222 (0x98, 'ò'),
223 (0x99, 'ô'),
224 (0x9A, 'ö'),
225 (0x9B, 'õ'),
226 (0x9C, 'ú'),
227 (0x9D, 'ù'),
228 (0x9E, 'û'),
229 (0x9F, 'ü'),
230 (0xA0, '†'),
231 (0xA1, '°'),
232 (0xA2, '¢'),
233 (0xA3, '£'),
234 (0xA4, '§'),
235 (0xA5, '•'),
236 (0xA6, '¶'),
237 (0xA7, 'ß'),
238 (0xA8, '®'),
239 (0xA9, '©'),
240 (0xAA, '™'),
241 (0xAB, '´'),
242 (0xAC, '¨'),
243 (0xAD, '≠'),
244 (0xAE, 'Æ'),
245 (0xAF, 'Ø'),
246 ];
247
248 for (byte, ch) in macroman_chars.iter() {
249 self.macroman_map.insert(*byte, *ch);
250 }
251 }
252
253 pub fn clear_log(&mut self) {
255 self.issue_log.clear();
256 }
257
258 pub fn get_issues(&self) -> &[EncodingIssue] {
260 &self.issue_log
261 }
262
263 #[allow(dead_code)]
265 fn log_issue(&mut self, issue: EncodingIssue) {
266 self.issue_log.push(issue);
267 }
268
269 fn analyze_encoding_indicators(&self, bytes: &[u8]) -> Vec<(EncodingType, f64)> {
271 let mut scores = vec![
272 (EncodingType::Utf8, 0.0),
273 (EncodingType::Latin1, 0.0),
274 (EncodingType::Windows1252, 0.0),
275 (EncodingType::MacRoman, 0.0),
276 ];
277
278 if std::str::from_utf8(bytes).is_ok() {
280 scores[0].1 = 0.9; }
282
283 let mut windows1252_indicators = 0;
285 let mut latin1_indicators = 0;
286 let mut macroman_indicators = 0;
287
288 for &byte in bytes {
289 if byte >= 0x80 {
290 if self.windows1252_map.contains_key(&byte) {
292 windows1252_indicators += 1;
293 if matches!(byte, 0x80 | 0x82..=0x8C | 0x8E | 0x91..=0x9C | 0x9E | 0x9F) {
295 scores[2].1 += 0.1;
296 }
297 }
298 if self.latin1_map.contains_key(&byte) {
299 latin1_indicators += 1;
300 }
301 if self.macroman_map.contains_key(&byte) {
302 macroman_indicators += 1;
303 }
304 }
305 }
306
307 if windows1252_indicators > 0 {
309 scores[2].1 += 0.3;
310 }
311 if latin1_indicators > 0 {
312 scores[1].1 += 0.2;
313 }
314 if macroman_indicators > 0 {
315 scores[3].1 += 0.1;
316 }
317
318 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
320 scores
321 }
322}
323
324impl Default for EnhancedDecoder {
325 fn default() -> Self {
326 Self::new()
327 }
328}
329
330impl CharacterDecoder for EnhancedDecoder {
331 fn decode(&self, bytes: &[u8], options: &EncodingOptions) -> Result<EncodingResult, PdfError> {
332 if let Some(preferred) = options.preferred_encoding {
334 if let Ok(text) = self.decode_with_encoding(bytes, preferred, options.lenient_mode) {
335 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
336 return Ok(EncodingResult {
337 text,
338 detected_encoding: Some(preferred),
339 replacement_count,
340 confidence: 0.8,
341 });
342 }
343 }
344
345 let encoding_candidates = self.analyze_encoding_indicators(bytes);
347
348 for (encoding, confidence) in encoding_candidates {
349 if confidence > 0.1 {
350 match self.decode_with_encoding(bytes, encoding, options.lenient_mode) {
351 Ok(text) => {
352 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
353
354 if replacement_count <= options.max_replacements {
355 return Ok(EncodingResult {
356 text,
357 detected_encoding: Some(encoding),
358 replacement_count,
359 confidence,
360 });
361 }
362 }
363 Err(_) => continue,
364 }
365 }
366 }
367
368 if options.lenient_mode {
370 let text = String::from_utf8_lossy(bytes).to_string();
371 let replacement_count = text.chars().filter(|&c| c == '\u{FFFD}').count();
372
373 Ok(EncodingResult {
374 text,
375 detected_encoding: Some(EncodingType::Mixed),
376 replacement_count,
377 confidence: 0.1,
378 })
379 } else {
380 Err(PdfError::EncodingError(
381 "Failed to decode text with any supported encoding".to_string(),
382 ))
383 }
384 }
385
386 fn detect_encoding(&self, bytes: &[u8]) -> Option<EncodingType> {
387 let candidates = self.analyze_encoding_indicators(bytes);
388 candidates.first().map(|(encoding, _)| *encoding)
389 }
390
391 fn decode_with_encoding(
392 &self,
393 bytes: &[u8],
394 encoding: EncodingType,
395 lenient: bool,
396 ) -> Result<String, PdfError> {
397 match encoding {
398 EncodingType::Utf8 => {
399 if lenient {
400 Ok(String::from_utf8_lossy(bytes).to_string())
401 } else {
402 String::from_utf8(bytes.to_vec())
403 .map_err(|e| PdfError::EncodingError(format!("UTF-8 decoding failed: {e}")))
404 }
405 }
406
407 EncodingType::Latin1 => {
408 let mut result = String::with_capacity(bytes.len());
409 for &byte in bytes {
410 if byte < 0x80 {
411 result.push(byte as char);
412 } else if let Some(&ch) = self.latin1_map.get(&byte) {
413 result.push(ch);
414 } else if lenient {
415 result.push('\u{FFFD}');
416 } else {
417 return Err(PdfError::EncodingError(format!(
418 "Invalid Latin-1 character: 0x{byte:02X}"
419 )));
420 }
421 }
422 Ok(result)
423 }
424
425 EncodingType::Windows1252 => {
426 let mut result = String::with_capacity(bytes.len());
427 for &byte in bytes {
428 if byte < 0x80 {
429 result.push(byte as char);
430 } else if let Some(&ch) = self.windows1252_map.get(&byte) {
431 result.push(ch);
432 } else if lenient {
433 result.push('\u{FFFD}');
434 } else {
435 return Err(PdfError::EncodingError(format!(
436 "Invalid Windows-1252 character: 0x{byte:02X}"
437 )));
438 }
439 }
440 Ok(result)
441 }
442
443 EncodingType::MacRoman => {
444 let mut result = String::with_capacity(bytes.len());
445 for &byte in bytes {
446 if byte < 0x80 {
447 result.push(byte as char);
448 } else if let Some(&ch) = self.macroman_map.get(&byte) {
449 result.push(ch);
450 } else if lenient {
451 result.push('\u{FFFD}');
452 } else {
453 return Err(PdfError::EncodingError(format!(
454 "Invalid MacRoman character: 0x{byte:02X}"
455 )));
456 }
457 }
458 Ok(result)
459 }
460
461 EncodingType::PdfDocEncoding => {
462 self.decode_with_encoding(bytes, EncodingType::Latin1, lenient)
464 }
465
466 EncodingType::Mixed => {
467 let candidates = [
469 EncodingType::Utf8,
470 EncodingType::Windows1252,
471 EncodingType::Latin1,
472 EncodingType::MacRoman,
473 ];
474
475 for candidate in &candidates {
476 if let Ok(result) = self.decode_with_encoding(bytes, *candidate, true) {
477 let replacement_count = result.chars().filter(|&c| c == '\u{FFFD}').count();
478 if replacement_count < bytes.len() / 4 {
479 return Ok(result);
481 }
482 }
483 }
484
485 Ok(String::from_utf8_lossy(bytes).to_string())
487 }
488 }
489 }
490}
491
492pub fn decode_text(bytes: &[u8]) -> Result<String, PdfError> {
494 let decoder = EnhancedDecoder::new();
495 let options = EncodingOptions::default();
496 let result = decoder.decode(bytes, &options)?;
497 Ok(result.text)
498}
499
500pub fn decode_text_with_encoding(bytes: &[u8], encoding: EncodingType) -> Result<String, PdfError> {
502 let decoder = EnhancedDecoder::new();
503 decoder.decode_with_encoding(bytes, encoding, true)
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid UTF-8 round-trips and is reported as UTF-8 without replacements.
    #[test]
    fn test_utf8_decoding() {
        let expected = "Hello, 世界!";

        let outcome = EnhancedDecoder::new()
            .decode(expected.as_bytes(), &EncodingOptions::default())
            .unwrap();

        assert_eq!(outcome.text, expected);
        assert_eq!(outcome.detected_encoding, Some(EncodingType::Utf8));
        assert_eq!(outcome.replacement_count, 0);
    }

    /// Latin-1 accented letters survive auto-detected decoding.
    #[test]
    fn test_latin1_decoding() {
        // "Hello, " followed by e-acute, e-grave and c-cedilla in Latin-1.
        let input = b"Hello, \xE9\xE8\xE7";

        let outcome = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("éèç"));
    }

    /// Windows-1252-specific bytes map to the euro sign and curly quotes.
    #[test]
    fn test_windows1252_decoding() {
        // Euro sign, space, left quote, "Hello", right quote.
        let input = b"\x80 \x91Hello\x92";

        let outcome = EnhancedDecoder::new()
            .decode(input, &EncodingOptions::default())
            .unwrap();

        assert!(outcome.text.contains("€"));
        assert!(outcome.text.contains('\u{2018}'));
        assert!(outcome.text.contains('\u{2019}'));
    }

    /// Lenient UTF-8 decoding substitutes invalid bytes and keeps the rest.
    #[test]
    fn test_lenient_mode() {
        let options = EncodingOptions {
            lenient_mode: true,
            preferred_encoding: Some(EncodingType::Utf8),
            ..EncodingOptions::default()
        };
        // Two bytes that are invalid in UTF-8, followed by "Hello".
        let input = b"\xFF\xFEHello";

        let outcome = EnhancedDecoder::new().decode(input, &options).unwrap();

        assert!(
            outcome.replacement_count > 0,
            "Expected replacement chars, got {}",
            outcome.replacement_count
        );
        assert!(outcome.text.contains("Hello"));
    }

    /// Detection picks UTF-8 for valid UTF-8 and a single-byte encoding for
    /// input containing a Windows-1252 euro byte.
    #[test]
    fn test_encoding_detection() {
        let decoder = EnhancedDecoder::new();

        assert_eq!(
            decoder.detect_encoding("Hello, 世界!".as_bytes()),
            Some(EncodingType::Utf8)
        );

        let guess = decoder.detect_encoding(b"\x80 Hello");
        assert!(matches!(
            guess,
            Some(EncodingType::Windows1252) | Some(EncodingType::Latin1)
        ));
    }

    /// Byte 0xC9 is capital E-acute in both Latin-1 and Windows-1252.
    #[test]
    fn test_specific_encoding() {
        let decoder = EnhancedDecoder::new();
        let input = b"\xC9";

        for &encoding in &[EncodingType::Latin1, EncodingType::Windows1252] {
            let decoded = decoder
                .decode_with_encoding(input, encoding, false)
                .unwrap();
            assert_eq!(decoded, "É");
        }
    }

    /// The free-function wrappers behave like the decoder methods they call.
    #[test]
    fn test_convenience_functions() {
        let plain = "Hello, world!";
        assert_eq!(decode_text(plain.as_bytes()).unwrap(), plain);

        // "Hello" followed by e-acute in Latin-1.
        let accented = decode_text_with_encoding(b"Hello\xE9", EncodingType::Latin1).unwrap();
        assert!(accented.contains("é"));
    }
}