subx_cli/core/formats/encoding/
detector.rs1use crate::Result;
2use crate::config::load_config;
3use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
4use std::fs::File;
5use std::io::Read;
6
7pub struct EncodingDetector {
9 confidence_threshold: f32,
10 max_sample_size: usize,
11 supported_charsets: Vec<Charset>,
12}
13
14impl EncodingDetector {
15 pub fn new() -> Result<Self> {
17 let config = load_config()?;
18 Ok(Self {
19 confidence_threshold: config.formats.encoding_detection_confidence,
20 max_sample_size: 8192,
21 supported_charsets: Self::default_charsets(),
22 })
23 }
24
25 pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
27 let mut file = File::open(file_path)?;
28 let mut buffer = vec![0; self.max_sample_size];
29 let bytes_read = file.read(&mut buffer)?;
30 buffer.truncate(bytes_read);
31 self.detect_encoding(&buffer)
32 }
33
34 pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
36 if let Some(encoding) = self.detect_bom(data) {
37 return Ok(encoding);
38 }
39 let candidates = self.analyze_byte_patterns(data)?;
40 self.select_best_encoding(candidates, data)
41 }
42
43 fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
44 if data.len() < 3 {
45 return None;
46 }
47 match &data[0..3] {
48 [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
49 charset: Charset::Utf8,
50 confidence: 1.0,
51 bom_detected: true,
52 sample_text: String::from("UTF-8 with BOM"),
53 }),
54 [0xFF, 0xFE, ..] => Some(EncodingInfo {
55 charset: Charset::Utf16Le,
56 confidence: 1.0,
57 bom_detected: true,
58 sample_text: String::from("UTF-16 LE with BOM"),
59 }),
60 [0xFE, 0xFF, ..] => Some(EncodingInfo {
61 charset: Charset::Utf16Be,
62 confidence: 1.0,
63 bom_detected: true,
64 sample_text: String::from("UTF-16 BE with BOM"),
65 }),
66 _ => {
67 if data.len() >= 4 {
68 match &data[0..4] {
69 [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
70 charset: Charset::Utf32Le,
71 confidence: 1.0,
72 bom_detected: true,
73 sample_text: String::from("UTF-32 LE with BOM"),
74 }),
75 [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
76 charset: Charset::Utf32Be,
77 confidence: 1.0,
78 bom_detected: true,
79 sample_text: String::from("UTF-32 BE with BOM"),
80 }),
81 _ => None,
82 }
83 } else {
84 None
85 }
86 }
87 }
88 }
89
90 fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
91 let mut candidates = Vec::new();
92 for charset in &self.supported_charsets {
93 let confidence = self.calculate_encoding_confidence(data, charset)?;
94 if confidence > 0.1 {
95 candidates.push(EncodingCandidate {
96 charset: charset.clone(),
97 confidence,
98 });
99 }
100 }
101 candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
102 Ok(candidates)
103 }
104
105 fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
106 match charset {
107 Charset::Utf8 => self.check_utf8_validity(data),
108 Charset::Gbk => self.check_gbk_patterns(data),
109 Charset::ShiftJis => self.check_shift_jis_patterns(data),
110 Charset::Big5 => self.check_big5_patterns(data),
111 Charset::Iso88591 => self.check_iso88591_patterns(data),
112 Charset::Windows1252 => self.check_windows1252_patterns(data),
113 _ => Ok(0.0),
114 }
115 }
116
117 fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
118 let mut valid_chars = 0;
119 let mut total_chars = 0;
120 let mut i = 0;
121
122 while i < data.len() {
123 total_chars += 1;
124 if data[i] & 0x80 == 0 {
125 valid_chars += 1;
126 i += 1;
127 } else if data[i] & 0xE0 == 0xC0 {
128 if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
129 valid_chars += 1;
130 }
131 i += 2;
132 } else if data[i] & 0xF0 == 0xE0 {
133 if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
134 valid_chars += 1;
135 }
136 i += 3;
137 } else if data[i] & 0xF8 == 0xF0 {
138 if i + 3 < data.len()
139 && data[i + 1] & 0xC0 == 0x80
140 && data[i + 2] & 0xC0 == 0x80
141 && data[i + 3] & 0xC0 == 0x80
142 {
143 valid_chars += 1;
144 }
145 i += 4;
146 } else {
147 i += 1;
148 }
149 }
150
151 Ok(if total_chars > 0 {
152 valid_chars as f32 / total_chars as f32
153 } else {
154 0.0
155 })
156 }
157
158 fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
159 let mut valid_chars = 0;
160 let mut total_chars = 0;
161 let mut i = 0;
162
163 while i < data.len() {
164 if data[i] < 0x80 {
165 valid_chars += 1;
166 total_chars += 1;
167 i += 1;
168 } else if i + 1 < data.len() {
169 let byte1 = data[i];
170 let byte2 = data[i + 1];
171 if (0x81..=0xFE).contains(&byte1)
172 && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
173 {
174 valid_chars += 1;
175 }
176 total_chars += 1;
177 i += 2;
178 } else {
179 total_chars += 1;
180 i += 1;
181 }
182 }
183
184 Ok(if total_chars > 0 {
185 valid_chars as f32 / total_chars as f32
186 } else {
187 0.0
188 })
189 }
190
191 fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
192 let mut valid_chars = 0;
193 let mut total_chars = 0;
194 let mut i = 0;
195
196 while i < data.len() {
197 if data[i] < 0x80 {
198 valid_chars += 1;
199 total_chars += 1;
200 i += 1;
201 } else if i + 1 < data.len() {
202 let byte1 = data[i];
203 let byte2 = data[i + 1];
204 if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
205 && (0x40..=0xFC).contains(&byte2)
206 && byte2 != 0x7F
207 {
208 valid_chars += 1;
209 }
210 total_chars += 1;
211 i += 2;
212 } else {
213 total_chars += 1;
214 i += 1;
215 }
216 }
217
218 Ok(if total_chars > 0 {
219 valid_chars as f32 / total_chars as f32
220 } else {
221 0.0
222 })
223 }
224
225 fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
226 let mut valid_chars = 0;
227 let mut total_chars = 0;
228 let mut i = 0;
229
230 while i < data.len() {
231 if data[i] < 0x80 {
232 valid_chars += 1;
233 total_chars += 1;
234 i += 1;
235 } else if i + 1 < data.len() {
236 let byte1 = data[i];
237 let byte2 = data[i + 1];
238 if (0xA1..=0xFE).contains(&byte1)
239 && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
240 {
241 valid_chars += 1;
242 }
243 total_chars += 1;
244 i += 2;
245 } else {
246 total_chars += 1;
247 i += 1;
248 }
249 }
250
251 Ok(if total_chars > 0 {
252 valid_chars as f32 / total_chars as f32
253 } else {
254 0.0
255 })
256 }
257
258 fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
259 let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
260 let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
261 if extended_count > 0 {
262 let utf8_conf = self.check_utf8_validity(data)?;
263 Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
264 } else {
265 Ok(0.5)
266 }
267 }
268
269 fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
270 let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
271 let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
272 if control_chars > 0 || extended_chars > 0 {
273 let utf8_conf = self.check_utf8_validity(data)?;
274 Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
275 } else {
276 Ok(0.3)
277 }
278 }
279
280 fn select_best_encoding(
281 &self,
282 candidates: Vec<EncodingCandidate>,
283 data: &[u8],
284 ) -> Result<EncodingInfo> {
285 if candidates.is_empty() {
286 return Ok(EncodingInfo {
287 charset: Charset::Unknown,
288 confidence: 0.0,
289 bom_detected: false,
290 sample_text: String::from("Unable to detect encoding"),
291 });
292 }
293 let best = &candidates[0];
294 if best.confidence < self.confidence_threshold {
295 let config = load_config()?;
296 return Ok(EncodingInfo {
297 charset: Charset::Utf8,
298 confidence: 0.5,
299 bom_detected: false,
300 sample_text: format!(
301 "Using default encoding: {}",
302 config.formats.default_encoding
303 ),
304 });
305 }
306 let sample = self.decode_sample(data, &best.charset)?;
307 Ok(EncodingInfo {
308 charset: best.charset.clone(),
309 confidence: best.confidence,
310 bom_detected: false,
311 sample_text: sample,
312 })
313 }
314
315 fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
316 let sample_size = data.len().min(200);
317 let sample_data = &data[0..sample_size];
318 match charset {
319 Charset::Utf8 => String::from_utf8(sample_data.to_vec())
320 .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
321 _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
322 }
323 }
324
325 fn default_charsets() -> Vec<Charset> {
326 vec![
327 Charset::Utf8,
328 Charset::Gbk,
329 Charset::ShiftJis,
330 Charset::Big5,
331 Charset::Iso88591,
332 Charset::Windows1252,
333 ]
334 }
335}
336
337#[derive(Debug, Clone)]
338struct EncodingCandidate {
339 charset: Charset,
340 confidence: f32,
341}
342
343impl Default for EncodingDetector {
344 fn default() -> Self {
345 Self::new().unwrap_or(Self {
346 confidence_threshold: 0.7,
347 max_sample_size: 8192,
348 supported_charsets: Self::default_charsets(),
349 })
350 }
351}