Skip to main content

subx_cli/core/formats/encoding/
converter.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
7/// Result of an encoding conversion operation.
8///
9/// Contains the converted text along with metadata about the conversion
10/// process, including error information and encoding details.
11#[derive(Debug, Clone)]
12pub struct ConversionResult {
13    /// The converted text in the target encoding
14    pub converted_text: String,
15    /// The original character encoding that was detected
16    pub original_encoding: Charset,
17    /// The target encoding for conversion
18    pub target_encoding: Charset,
19    /// Number of bytes processed during conversion
20    pub bytes_processed: usize,
21    /// Whether any errors occurred during conversion
22    pub had_errors: bool,
23    /// Total number of conversion errors encountered
24    pub error_count: usize,
25}
26
27/// Encoding converter
28pub struct EncodingConverter {
29    encoding_map: HashMap<Charset, &'static Encoding>,
30}
31
32impl EncodingConverter {
33    /// Create converter and initialize encoding mapping
34    pub fn new() -> Self {
35        let mut encoding_map = HashMap::new();
36        encoding_map.insert(Charset::Utf8, UTF_8);
37        encoding_map.insert(Charset::Gbk, GBK);
38        encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
39        encoding_map.insert(Charset::Big5, BIG5);
40        encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
41        encoding_map.insert(Charset::Iso88591, ISO_8859_2);
42        Self { encoding_map }
43    }
44
45    /// Convert data to UTF-8
46    pub fn convert_to_utf8(
47        &self,
48        data: &[u8],
49        source_encoding: &Charset,
50    ) -> Result<ConversionResult> {
51        if *source_encoding == Charset::Utf8 {
52            return Ok(ConversionResult {
53                converted_text: String::from_utf8_lossy(data).to_string(),
54                original_encoding: Charset::Utf8,
55                target_encoding: Charset::Utf8,
56                bytes_processed: data.len(),
57                had_errors: false,
58                error_count: 0,
59            });
60        }
61        let encoding = self
62            .encoding_map
63            .get(source_encoding)
64            .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
65        let (converted, _, had_errors) = encoding.decode(data);
66        let error_count = if had_errors {
67            self.count_replacement_chars(&converted)
68        } else {
69            0
70        };
71        Ok(ConversionResult {
72            converted_text: converted.into_owned(),
73            original_encoding: source_encoding.clone(),
74            target_encoding: Charset::Utf8,
75            bytes_processed: data.len(),
76            had_errors,
77            error_count,
78        })
79    }
80
81    /// Convert file content to UTF-8
82    pub fn convert_file_to_utf8(
83        &self,
84        file_path: &str,
85        encoding_info: &EncodingInfo,
86    ) -> Result<ConversionResult> {
87        crate::core::fs_util::check_file_size(
88            std::path::Path::new(file_path),
89            52_428_800,
90            "Subtitle",
91        )?;
92        let data = std::fs::read(file_path)?;
93        let slice = if encoding_info.bom_detected {
94            self.skip_bom(&data, &encoding_info.charset)
95        } else {
96            data.as_slice()
97        };
98        self.convert_to_utf8(slice, &encoding_info.charset)
99    }
100
101    fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
102        match charset {
103            Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
104            Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
105            Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
106            Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
107            Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
108            _ => data,
109        }
110    }
111
112    fn count_replacement_chars(&self, text: &str) -> usize {
113        text.chars().filter(|&c| c == '\u{FFFD}').count()
114    }
115
116    /// Validate conversion result
117    pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
118        ValidationResult {
119            is_valid: !result.had_errors || result.error_count == 0,
120            confidence: if result.had_errors {
121                1.0 - result.error_count as f32 / result.converted_text.len() as f32
122            } else {
123                1.0
124            },
125            warnings: self.generate_warnings(result),
126        }
127    }
128
129    fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
130        let mut warnings = Vec::new();
131        if result.had_errors {
132            warnings.push(format!(
133                "Encoding conversion had {} replacement characters",
134                result.error_count
135            ));
136        }
137        if result.error_count > result.bytes_processed / 10 {
138            warnings.push("High error rate detected - encoding may be incorrect".to_string());
139        }
140        warnings
141    }
142}
143
/// Verdict produced by [`EncodingConverter::validate_conversion`].
///
/// Holds a pass/fail flag, a confidence score, and any warnings raised about
/// the conversion.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// `true` when the conversion is considered valid.
    pub is_valid: bool,
    /// Confidence in the conversion, from 0.0 (none) to 1.0 (full).
    pub confidence: f32,
    /// Human-readable warnings about the conversion.
    pub warnings: Vec<String>,
}
157
158impl Default for EncodingConverter {
159    fn default() -> Self {
160        Self::new()
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
    use std::fs;
    use tempfile::TempDir;

    /// "你好" encoded as GBK (你 = 0xC4E3, 好 = 0xBAC3).
    const NI_HAO_GBK: [u8; 4] = [0xC4, 0xE3, 0xBA, 0xC3];

    fn make_converter() -> EncodingConverter {
        EncodingConverter::new()
    }

    fn make_encoding_info(charset: Charset, bom_detected: bool) -> EncodingInfo {
        EncodingInfo {
            charset,
            confidence: 1.0,
            bom_detected,
            sample_text: String::new(),
        }
    }

    /// Write `bytes` to a file called `name` inside a fresh temporary
    /// directory. The `TempDir` guard is returned so the directory stays alive
    /// for as long as the caller holds it.
    fn write_temp_file(name: &str, bytes: &[u8]) -> (TempDir, String) {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join(name);
        fs::write(&path, bytes).unwrap();
        let path = path.to_str().unwrap().to_owned();
        (dir, path)
    }

    /// Assert that `charset` has no registered decoder.
    fn assert_unsupported(data: &[u8], charset: Charset) {
        let err = make_converter()
            .convert_to_utf8(data, &charset)
            .unwrap_err();
        assert!(err.to_string().contains("Unsupported encoding"));
    }

    /// Write a BOM-prefixed file, run it through `convert_file_to_utf8` with
    /// `bom_detected = true`, and assert that the conversion fails.
    fn assert_bom_file_conversion_fails(name: &str, bytes: &[u8], charset: Charset) {
        let (_dir, path) = write_temp_file(name, bytes);
        let info = make_encoding_info(charset, true);
        let result = make_converter().convert_file_to_utf8(&path, &info);
        assert!(result.is_err());
    }

    /// Build a `ConversionResult` targeting UTF-8 for the validation tests.
    fn make_result(
        text: &str,
        original_encoding: Charset,
        bytes_processed: usize,
        had_errors: bool,
        error_count: usize,
    ) -> ConversionResult {
        ConversionResult {
            converted_text: text.to_string(),
            original_encoding,
            target_encoding: Charset::Utf8,
            bytes_processed,
            had_errors,
            error_count,
        }
    }

    // --- convert_to_utf8: UTF-8 passthrough ---

    #[test]
    fn test_convert_to_utf8_utf8_passthrough_ascii() {
        let converter = make_converter();
        let text = "Hello, World!";
        let result = converter
            .convert_to_utf8(text.as_bytes(), &Charset::Utf8)
            .unwrap();
        assert_eq!(result.converted_text, text);
        assert_eq!(result.original_encoding, Charset::Utf8);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, text.len());
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    #[test]
    fn test_convert_to_utf8_utf8_passthrough_multibyte() {
        let converter = make_converter();
        let text = "測試文字 🌍";
        let result = converter
            .convert_to_utf8(text.as_bytes(), &Charset::Utf8)
            .unwrap();
        assert_eq!(result.converted_text, text);
        assert_eq!(result.original_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, text.len());
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_to_utf8_utf8_empty_bytes() {
        let converter = make_converter();
        let result = converter.convert_to_utf8(&[], &Charset::Utf8).unwrap();
        assert_eq!(result.converted_text, "");
        assert_eq!(result.bytes_processed, 0);
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    // --- convert_to_utf8: non-UTF-8 encodings ---

    #[test]
    fn test_convert_to_utf8_gbk() {
        let converter = make_converter();
        let result = converter
            .convert_to_utf8(&NI_HAO_GBK, &Charset::Gbk)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Gbk);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, NI_HAO_GBK.len());
        assert!(!result.had_errors);
        assert_eq!(result.converted_text, "你好");
    }

    #[test]
    fn test_convert_to_utf8_gbk_empty() {
        let converter = make_converter();
        let result = converter.convert_to_utf8(&[], &Charset::Gbk).unwrap();
        assert_eq!(result.converted_text, "");
        assert_eq!(result.bytes_processed, 0);
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    #[test]
    fn test_convert_to_utf8_windows1252() {
        let converter = make_converter();
        // "café" — 'é' is 0xE9 in Windows-1252
        let bytes = [b'c', b'a', b'f', 0xE9u8];
        let result = converter
            .convert_to_utf8(&bytes, &Charset::Windows1252)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Windows1252);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, bytes.len());
        // Exact comparison; 0xE9 decodes to the precomposed U+00E9.
        assert_eq!(result.converted_text, "caf\u{E9}");
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_to_utf8_shiftjis() {
        let converter = make_converter();
        // "テスト" (katakana) in Shift_JIS: テ=0x8365, ス=0x8358, ト=0x8367
        let shiftjis_bytes = [0x83u8, 0x65, 0x83, 0x58, 0x83, 0x67];
        let result = converter
            .convert_to_utf8(&shiftjis_bytes, &Charset::ShiftJis)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::ShiftJis);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, shiftjis_bytes.len());
        assert_eq!(result.converted_text, "テスト");
    }

    #[test]
    fn test_convert_to_utf8_big5() {
        let converter = make_converter();
        // Intended as "你好" in Big5 (你=0xA741, 好=0xA66E); the byte values
        // are not pinned by an exact assertion here.
        let big5_bytes = [0xA7u8, 0x41, 0xA6, 0x6E];
        let result = converter
            .convert_to_utf8(&big5_bytes, &Charset::Big5)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Big5);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, big5_bytes.len());
        assert!(!result.converted_text.is_empty());
    }

    #[test]
    fn test_convert_to_utf8_iso88591() {
        let converter = make_converter();
        // ASCII text followed by one high-bit byte. 0xE0 is 'à' in
        // ISO-8859-1; the exact decoded character is deliberately not
        // asserted here, only that decoding succeeds and yields text.
        let bytes = [b'H', b'e', b'l', b'l', b'o', 0xE0u8];
        let result = converter
            .convert_to_utf8(&bytes, &Charset::Iso88591)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Iso88591);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, bytes.len());
        assert!(!result.converted_text.is_empty());
        assert!(result.converted_text.starts_with("Hello"));
    }

    // --- convert_to_utf8: unsupported/error paths ---

    #[test]
    fn test_convert_to_utf8_unknown_returns_error() {
        assert_unsupported(b"some data", Charset::Unknown);
    }

    #[test]
    fn test_convert_to_utf8_utf16le_returns_error() {
        assert_unsupported(b"data", Charset::Utf16Le);
    }

    #[test]
    fn test_convert_to_utf8_utf16be_returns_error() {
        assert_unsupported(b"data", Charset::Utf16Be);
    }

    #[test]
    fn test_convert_to_utf8_utf32le_returns_error() {
        assert_unsupported(b"data", Charset::Utf32Le);
    }

    #[test]
    fn test_convert_to_utf8_utf32be_returns_error() {
        assert_unsupported(b"data", Charset::Utf32Be);
    }

    #[test]
    fn test_convert_to_utf8_euckr_returns_error() {
        assert_unsupported(b"data", Charset::Euckr);
    }

    // --- count_replacement_chars exercised through had_errors path ---

    #[test]
    fn test_convert_to_utf8_invalid_gbk_triggers_replacement_chars() {
        let converter = make_converter();
        // 0x81 starts a 2-byte GBK sequence; 0x20 (space) is not a valid second byte
        let invalid_gbk = [0x81u8, 0x20, 0x81, 0x20];
        let result = converter
            .convert_to_utf8(&invalid_gbk, &Charset::Gbk)
            .unwrap();
        // Assert unconditionally: guarding these checks behind
        // `if result.had_errors` would let the test pass without ever
        // exercising the error path.
        assert!(result.had_errors);
        assert!(result.error_count > 0);
        assert!(result.converted_text.contains('\u{FFFD}'));
    }

    // --- convert_file_to_utf8 ---

    #[test]
    fn test_convert_file_to_utf8_utf8_no_bom() {
        let converter = make_converter();
        let content = "Hello, 世界!";
        let (_dir, path) = write_temp_file("test.txt", content.as_bytes());
        let info = make_encoding_info(Charset::Utf8, false);
        let result = converter.convert_file_to_utf8(&path, &info).unwrap();
        assert_eq!(result.converted_text, content);
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_file_to_utf8_nonexistent_file() {
        let converter = make_converter();
        let info = make_encoding_info(Charset::Utf8, false);
        let result = converter.convert_file_to_utf8("/nonexistent/path/does_not_exist.txt", &info);
        assert!(result.is_err());
    }

    #[test]
    fn test_convert_file_to_utf8_gbk_no_bom() {
        let converter = make_converter();
        let (_dir, path) = write_temp_file("gbk.txt", &NI_HAO_GBK);
        let info = make_encoding_info(Charset::Gbk, false);
        let result = converter.convert_file_to_utf8(&path, &info).unwrap();
        assert_eq!(result.original_encoding, Charset::Gbk);
        assert!(result.converted_text.contains('你'));
    }

    // --- BOM handling via convert_file_to_utf8 / skip_bom ---

    #[test]
    fn test_convert_file_to_utf8_utf8_with_bom_stripped() {
        let converter = make_converter();
        let content = "Hello, World!";
        let mut data = vec![0xEFu8, 0xBB, 0xBF]; // UTF-8 BOM
        data.extend_from_slice(content.as_bytes());
        let (_dir, path) = write_temp_file("bom_utf8.txt", &data);
        let info = make_encoding_info(Charset::Utf8, true);
        let result = converter.convert_file_to_utf8(&path, &info).unwrap();
        // BOM must be stripped; converted text should equal original content
        assert_eq!(result.converted_text, content);
        assert!(!result.had_errors);
    }

    #[test]
    fn test_skip_bom_utf16le_exercised_then_fails() {
        // skip_bom strips the 2-byte BOM; convert_to_utf8 then rejects Utf16Le
        assert_bom_file_conversion_fails(
            "utf16le.bin",
            b"\xFF\xFEH\x00i\x00",
            Charset::Utf16Le,
        );
    }

    #[test]
    fn test_skip_bom_utf16be_exercised_then_fails() {
        assert_bom_file_conversion_fails(
            "utf16be.bin",
            b"\xFE\xFF\x00H\x00i",
            Charset::Utf16Be,
        );
    }

    #[test]
    fn test_skip_bom_utf32le_exercised_then_fails() {
        assert_bom_file_conversion_fails(
            "utf32le.bin",
            b"\xFF\xFE\x00\x00H\x00\x00\x00",
            Charset::Utf32Le,
        );
    }

    #[test]
    fn test_skip_bom_utf32be_exercised_then_fails() {
        assert_bom_file_conversion_fails(
            "utf32be.bin",
            b"\x00\x00\xFE\xFF\x00\x00\x00H",
            Charset::Utf32Be,
        );
    }

    #[test]
    fn test_skip_bom_fallthrough_mismatched_bom_flag() {
        // bom_detected=true but charset=Gbk → skip_bom returns the data untouched
        let converter = make_converter();
        let (_dir, path) = write_temp_file("gbk_no_bom.txt", &NI_HAO_GBK);
        let info = make_encoding_info(Charset::Gbk, true);
        let result = converter.convert_file_to_utf8(&path, &info).unwrap();
        assert!(result.converted_text.contains('你'));
    }

    #[test]
    fn test_skip_bom_utf8_charset_but_no_bom_bytes() {
        // bom_detected=true, Charset::Utf8, but file has no BOM → data is left untouched
        let converter = make_converter();
        let content = "Just text";
        let (_dir, path) = write_temp_file("utf8_nobom.txt", content.as_bytes());
        let info = make_encoding_info(Charset::Utf8, true);
        let result = converter.convert_file_to_utf8(&path, &info).unwrap();
        assert_eq!(result.converted_text, content);
    }

    // --- validate_conversion ---

    #[test]
    fn test_validate_conversion_no_errors() {
        let converter = make_converter();
        let result = make_result("Hello World", Charset::Utf8, 11, false, 0);
        let validation = converter.validate_conversion(&result);
        assert!(validation.is_valid);
        assert_eq!(validation.confidence, 1.0);
        assert!(validation.warnings.is_empty());
    }

    #[test]
    fn test_validate_conversion_had_errors_zero_count_still_valid() {
        // had_errors=true but error_count=0 → is_valid = !true || true = true
        let converter = make_converter();
        let result = make_result("Hello World", Charset::Gbk, 11, true, 0);
        let validation = converter.validate_conversion(&result);
        assert!(validation.is_valid);
        // confidence = 1.0 - 0/11 = 1.0
        assert_eq!(validation.confidence, 1.0);
        // had_errors=true → warning about replacement chars
        assert_eq!(validation.warnings.len(), 1);
        assert!(validation.warnings[0].contains("replacement characters"));
    }

    #[test]
    fn test_validate_conversion_with_replacement_errors() {
        let converter = make_converter();
        let result = make_result("Hello\u{FFFD}World", Charset::Windows1252, 11, true, 1);
        let validation = converter.validate_conversion(&result);
        // is_valid = !true || (1 == 0) = false
        assert!(!validation.is_valid);
        assert!(validation.confidence < 1.0);
        assert!(!validation.warnings.is_empty());
        assert!(validation.warnings[0].contains("replacement characters"));
    }

    #[test]
    fn test_validate_conversion_high_error_rate_warning() {
        let converter = make_converter();
        // error_count=3 > bytes_processed(10) / 10 = 1 → second warning
        let result = make_result(
            "\u{FFFD}\u{FFFD}\u{FFFD}AB",
            Charset::ShiftJis,
            10,
            true,
            3,
        );
        let validation = converter.validate_conversion(&result);
        assert!(!validation.is_valid);
        assert!(validation.warnings.len() >= 2);
        assert!(
            validation
                .warnings
                .iter()
                .any(|w| w.contains("High error rate"))
        );
    }

    // --- Default impl ---

    #[test]
    fn test_encoding_converter_default_works() {
        let converter = EncodingConverter::default();
        let result = converter.convert_to_utf8(b"hello", &Charset::Utf8).unwrap();
        assert_eq!(result.converted_text, "hello");
    }
}