subx_cli/core/formats/encoding/
converter.rs1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
7#[derive(Debug, Clone)]
9pub struct ConversionResult {
10 pub converted_text: String,
11 pub original_encoding: Charset,
12 pub target_encoding: Charset,
13 pub bytes_processed: usize,
14 pub had_errors: bool,
15 pub error_count: usize,
16}
17
18pub struct EncodingConverter {
20 encoding_map: HashMap<Charset, &'static Encoding>,
21}
22
23impl EncodingConverter {
24 pub fn new() -> Self {
26 let mut encoding_map = HashMap::new();
27 encoding_map.insert(Charset::Utf8, UTF_8);
28 encoding_map.insert(Charset::Gbk, GBK);
29 encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
30 encoding_map.insert(Charset::Big5, BIG5);
31 encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
32 encoding_map.insert(Charset::Iso88591, ISO_8859_2);
33 Self { encoding_map }
34 }
35
36 pub fn convert_to_utf8(
38 &self,
39 data: &[u8],
40 source_encoding: &Charset,
41 ) -> Result<ConversionResult> {
42 if *source_encoding == Charset::Utf8 {
43 return Ok(ConversionResult {
44 converted_text: String::from_utf8_lossy(data).to_string(),
45 original_encoding: Charset::Utf8,
46 target_encoding: Charset::Utf8,
47 bytes_processed: data.len(),
48 had_errors: false,
49 error_count: 0,
50 });
51 }
52 let encoding = self
53 .encoding_map
54 .get(source_encoding)
55 .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
56 let (converted, _, had_errors) = encoding.decode(data);
57 let error_count = if had_errors {
58 self.count_replacement_chars(&converted)
59 } else {
60 0
61 };
62 Ok(ConversionResult {
63 converted_text: converted.into_owned(),
64 original_encoding: source_encoding.clone(),
65 target_encoding: Charset::Utf8,
66 bytes_processed: data.len(),
67 had_errors,
68 error_count,
69 })
70 }
71
72 pub fn convert_file_to_utf8(
74 &self,
75 file_path: &str,
76 encoding_info: &EncodingInfo,
77 ) -> Result<ConversionResult> {
78 let data = std::fs::read(file_path)?;
79 let slice = if encoding_info.bom_detected {
80 self.skip_bom(&data, &encoding_info.charset)
81 } else {
82 data.as_slice()
83 };
84 self.convert_to_utf8(slice, &encoding_info.charset)
85 }
86
87 fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
88 match charset {
89 Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
90 Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
91 Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
92 Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
93 Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
94 _ => data,
95 }
96 }
97
98 fn count_replacement_chars(&self, text: &str) -> usize {
99 text.chars().filter(|&c| c == '\u{FFFD}').count()
100 }
101
102 pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
104 ValidationResult {
105 is_valid: !result.had_errors || result.error_count == 0,
106 confidence: if result.had_errors {
107 1.0 - result.error_count as f32 / result.converted_text.len() as f32
108 } else {
109 1.0
110 },
111 warnings: self.generate_warnings(result),
112 }
113 }
114
115 fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
116 let mut warnings = Vec::new();
117 if result.had_errors {
118 warnings.push(format!(
119 "Encoding conversion had {} replacement characters",
120 result.error_count
121 ));
122 }
123 if result.error_count > result.bytes_processed / 10 {
124 warnings.push("High error rate detected - encoding may be incorrect".to_string());
125 }
126 warnings
127 }
128}
129
/// Quality assessment of a [`ConversionResult`].
#[derive(Clone, Debug)]
pub struct ValidationResult {
    /// `true` when the conversion recorded no replacement characters.
    pub is_valid: bool,
    /// Confidence score for the conversion; `1.0` when no errors were reported.
    pub confidence: f32,
    /// Human-readable warnings describing problems found in the conversion.
    pub warnings: Vec<String>,
}
137
138impl Default for EncodingConverter {
139 fn default() -> Self {
140 Self::new()
141 }
142}