subx_cli/core/formats/encoding/
converter.rs1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
7#[derive(Debug, Clone)]
12pub struct ConversionResult {
13 pub converted_text: String,
15 pub original_encoding: Charset,
17 pub target_encoding: Charset,
19 pub bytes_processed: usize,
21 pub had_errors: bool,
23 pub error_count: usize,
25}
26
27pub struct EncodingConverter {
29 encoding_map: HashMap<Charset, &'static Encoding>,
30}
31
32impl EncodingConverter {
33 pub fn new() -> Self {
35 let mut encoding_map = HashMap::new();
36 encoding_map.insert(Charset::Utf8, UTF_8);
37 encoding_map.insert(Charset::Gbk, GBK);
38 encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
39 encoding_map.insert(Charset::Big5, BIG5);
40 encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
41 encoding_map.insert(Charset::Iso88591, ISO_8859_2);
42 Self { encoding_map }
43 }
44
45 pub fn convert_to_utf8(
47 &self,
48 data: &[u8],
49 source_encoding: &Charset,
50 ) -> Result<ConversionResult> {
51 if *source_encoding == Charset::Utf8 {
52 return Ok(ConversionResult {
53 converted_text: String::from_utf8_lossy(data).to_string(),
54 original_encoding: Charset::Utf8,
55 target_encoding: Charset::Utf8,
56 bytes_processed: data.len(),
57 had_errors: false,
58 error_count: 0,
59 });
60 }
61 let encoding = self
62 .encoding_map
63 .get(source_encoding)
64 .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
65 let (converted, _, had_errors) = encoding.decode(data);
66 let error_count = if had_errors {
67 self.count_replacement_chars(&converted)
68 } else {
69 0
70 };
71 Ok(ConversionResult {
72 converted_text: converted.into_owned(),
73 original_encoding: source_encoding.clone(),
74 target_encoding: Charset::Utf8,
75 bytes_processed: data.len(),
76 had_errors,
77 error_count,
78 })
79 }
80
81 pub fn convert_file_to_utf8(
83 &self,
84 file_path: &str,
85 encoding_info: &EncodingInfo,
86 ) -> Result<ConversionResult> {
87 let data = std::fs::read(file_path)?;
88 let slice = if encoding_info.bom_detected {
89 self.skip_bom(&data, &encoding_info.charset)
90 } else {
91 data.as_slice()
92 };
93 self.convert_to_utf8(slice, &encoding_info.charset)
94 }
95
96 fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
97 match charset {
98 Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
99 Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
100 Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
101 Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
102 Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
103 _ => data,
104 }
105 }
106
107 fn count_replacement_chars(&self, text: &str) -> usize {
108 text.chars().filter(|&c| c == '\u{FFFD}').count()
109 }
110
111 pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
113 ValidationResult {
114 is_valid: !result.had_errors || result.error_count == 0,
115 confidence: if result.had_errors {
116 1.0 - result.error_count as f32 / result.converted_text.len() as f32
117 } else {
118 1.0
119 },
120 warnings: self.generate_warnings(result),
121 }
122 }
123
124 fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
125 let mut warnings = Vec::new();
126 if result.had_errors {
127 warnings.push(format!(
128 "Encoding conversion had {} replacement characters",
129 result.error_count
130 ));
131 }
132 if result.error_count > result.bytes_processed / 10 {
133 warnings.push("High error rate detected - encoding may be incorrect".to_string());
134 }
135 warnings
136 }
137}
138
/// Quality assessment of a finished conversion.
#[derive(Clone, Debug)]
pub struct ValidationResult {
    /// Whether the conversion is considered usable.
    pub is_valid: bool,
    /// Confidence score for the conversion; `1.0` means no errors were seen.
    pub confidence: f32,
    /// Human-readable notes about problems found in the conversion.
    pub warnings: Vec<String>,
}
152
153impl Default for EncodingConverter {
154 fn default() -> Self {
155 Self::new()
156 }
157}