subx_cli/core/formats/encoding/
converter.rs1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
7#[derive(Debug, Clone)]
12pub struct ConversionResult {
13 pub converted_text: String,
15 pub original_encoding: Charset,
17 pub target_encoding: Charset,
19 pub bytes_processed: usize,
21 pub had_errors: bool,
23 pub error_count: usize,
25}
26
27pub struct EncodingConverter {
29 encoding_map: HashMap<Charset, &'static Encoding>,
30}
31
32impl EncodingConverter {
33 pub fn new() -> Self {
35 let mut encoding_map = HashMap::new();
36 encoding_map.insert(Charset::Utf8, UTF_8);
37 encoding_map.insert(Charset::Gbk, GBK);
38 encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
39 encoding_map.insert(Charset::Big5, BIG5);
40 encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
41 encoding_map.insert(Charset::Iso88591, ISO_8859_2);
42 Self { encoding_map }
43 }
44
45 pub fn convert_to_utf8(
47 &self,
48 data: &[u8],
49 source_encoding: &Charset,
50 ) -> Result<ConversionResult> {
51 if *source_encoding == Charset::Utf8 {
52 return Ok(ConversionResult {
53 converted_text: String::from_utf8_lossy(data).to_string(),
54 original_encoding: Charset::Utf8,
55 target_encoding: Charset::Utf8,
56 bytes_processed: data.len(),
57 had_errors: false,
58 error_count: 0,
59 });
60 }
61 let encoding = self
62 .encoding_map
63 .get(source_encoding)
64 .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
65 let (converted, _, had_errors) = encoding.decode(data);
66 let error_count = if had_errors {
67 self.count_replacement_chars(&converted)
68 } else {
69 0
70 };
71 Ok(ConversionResult {
72 converted_text: converted.into_owned(),
73 original_encoding: source_encoding.clone(),
74 target_encoding: Charset::Utf8,
75 bytes_processed: data.len(),
76 had_errors,
77 error_count,
78 })
79 }
80
81 pub fn convert_file_to_utf8(
83 &self,
84 file_path: &str,
85 encoding_info: &EncodingInfo,
86 ) -> Result<ConversionResult> {
87 crate::core::fs_util::check_file_size(
88 std::path::Path::new(file_path),
89 52_428_800,
90 "Subtitle",
91 )?;
92 let data = std::fs::read(file_path)?;
93 let slice = if encoding_info.bom_detected {
94 self.skip_bom(&data, &encoding_info.charset)
95 } else {
96 data.as_slice()
97 };
98 self.convert_to_utf8(slice, &encoding_info.charset)
99 }
100
101 fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
102 match charset {
103 Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
104 Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
105 Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
106 Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
107 Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
108 _ => data,
109 }
110 }
111
112 fn count_replacement_chars(&self, text: &str) -> usize {
113 text.chars().filter(|&c| c == '\u{FFFD}').count()
114 }
115
116 pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
118 ValidationResult {
119 is_valid: !result.had_errors || result.error_count == 0,
120 confidence: if result.had_errors {
121 1.0 - result.error_count as f32 / result.converted_text.len() as f32
122 } else {
123 1.0
124 },
125 warnings: self.generate_warnings(result),
126 }
127 }
128
129 fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
130 let mut warnings = Vec::new();
131 if result.had_errors {
132 warnings.push(format!(
133 "Encoding conversion had {} replacement characters",
134 result.error_count
135 ));
136 }
137 if result.error_count > result.bytes_processed / 10 {
138 warnings.push("High error rate detected - encoding may be incorrect".to_string());
139 }
140 warnings
141 }
142}
143
/// Quality summary for a finished encoding conversion.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Whether the conversion is considered usable.
    pub is_valid: bool,
    /// Confidence score for the conversion; `1.0` means no errors were seen.
    pub confidence: f32,
    /// Human-readable warnings describing any problems found.
    pub warnings: Vec<String>,
}
157
158impl Default for EncodingConverter {
159 fn default() -> Self {
160 Self::new()
161 }
162}