subx_cli/core/formats/encoding/
detector.rs1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use std::fs::File;
4use std::io::Read;
5
6pub struct EncodingDetector {
8 confidence_threshold: f32,
9 max_sample_size: usize,
10 supported_charsets: Vec<Charset>,
11}
12
13impl EncodingDetector {
14 pub fn new(config: &crate::config::Config) -> Self {
16 Self {
17 confidence_threshold: config.formats.encoding_detection_confidence,
18 max_sample_size: 8192,
19 supported_charsets: Self::default_charsets(),
20 }
21 }
22
23 pub fn with_defaults() -> Self {
25 Self {
26 confidence_threshold: 0.8, max_sample_size: 8192,
28 supported_charsets: Self::default_charsets(),
29 }
30 }
31
32 pub fn with_config(config: &crate::config::Config) -> Self {
34 Self {
35 confidence_threshold: config.formats.encoding_detection_confidence,
36 max_sample_size: 8192,
37 supported_charsets: Self::default_charsets(),
38 }
39 }
40
41 pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
43 let mut file = File::open(file_path)?;
44 let mut buffer = vec![0; self.max_sample_size];
45 let bytes_read = file.read(&mut buffer)?;
46 buffer.truncate(bytes_read);
47 self.detect_encoding(&buffer)
48 }
49
50 pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
52 if let Some(encoding) = self.detect_bom(data) {
53 return Ok(encoding);
54 }
55 let candidates = self.analyze_byte_patterns(data)?;
56 self.select_best_encoding(candidates, data)
57 }
58
59 fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
60 if data.len() < 3 {
61 return None;
62 }
63 match &data[0..3] {
64 [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
65 charset: Charset::Utf8,
66 confidence: 1.0,
67 bom_detected: true,
68 sample_text: String::from("UTF-8 with BOM"),
69 }),
70 [0xFF, 0xFE, ..] => Some(EncodingInfo {
71 charset: Charset::Utf16Le,
72 confidence: 1.0,
73 bom_detected: true,
74 sample_text: String::from("UTF-16 LE with BOM"),
75 }),
76 [0xFE, 0xFF, ..] => Some(EncodingInfo {
77 charset: Charset::Utf16Be,
78 confidence: 1.0,
79 bom_detected: true,
80 sample_text: String::from("UTF-16 BE with BOM"),
81 }),
82 _ => {
83 if data.len() >= 4 {
84 match &data[0..4] {
85 [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
86 charset: Charset::Utf32Le,
87 confidence: 1.0,
88 bom_detected: true,
89 sample_text: String::from("UTF-32 LE with BOM"),
90 }),
91 [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
92 charset: Charset::Utf32Be,
93 confidence: 1.0,
94 bom_detected: true,
95 sample_text: String::from("UTF-32 BE with BOM"),
96 }),
97 _ => None,
98 }
99 } else {
100 None
101 }
102 }
103 }
104 }
105
106 fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
107 let mut candidates = Vec::new();
108 for charset in &self.supported_charsets {
109 let confidence = self.calculate_encoding_confidence(data, charset)?;
110 if confidence > 0.1 {
111 candidates.push(EncodingCandidate {
112 charset: charset.clone(),
113 confidence,
114 });
115 }
116 }
117 candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
118 Ok(candidates)
119 }
120
121 fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
122 match charset {
123 Charset::Utf8 => self.check_utf8_validity(data),
124 Charset::Gbk => self.check_gbk_patterns(data),
125 Charset::ShiftJis => self.check_shift_jis_patterns(data),
126 Charset::Big5 => self.check_big5_patterns(data),
127 Charset::Iso88591 => self.check_iso88591_patterns(data),
128 Charset::Windows1252 => self.check_windows1252_patterns(data),
129 _ => Ok(0.0),
130 }
131 }
132
133 fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
134 let mut valid_chars = 0;
135 let mut total_chars = 0;
136 let mut i = 0;
137
138 while i < data.len() {
139 total_chars += 1;
140 if data[i] & 0x80 == 0 {
141 valid_chars += 1;
142 i += 1;
143 } else if data[i] & 0xE0 == 0xC0 {
144 if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
145 valid_chars += 1;
146 }
147 i += 2;
148 } else if data[i] & 0xF0 == 0xE0 {
149 if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
150 valid_chars += 1;
151 }
152 i += 3;
153 } else if data[i] & 0xF8 == 0xF0 {
154 if i + 3 < data.len()
155 && data[i + 1] & 0xC0 == 0x80
156 && data[i + 2] & 0xC0 == 0x80
157 && data[i + 3] & 0xC0 == 0x80
158 {
159 valid_chars += 1;
160 }
161 i += 4;
162 } else {
163 i += 1;
164 }
165 }
166
167 Ok(if total_chars > 0 {
168 valid_chars as f32 / total_chars as f32
169 } else {
170 0.0
171 })
172 }
173
174 fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
175 let mut valid_chars = 0;
176 let mut total_chars = 0;
177 let mut i = 0;
178
179 while i < data.len() {
180 if data[i] < 0x80 {
181 valid_chars += 1;
182 total_chars += 1;
183 i += 1;
184 } else if i + 1 < data.len() {
185 let byte1 = data[i];
186 let byte2 = data[i + 1];
187 if (0x81..=0xFE).contains(&byte1)
188 && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
189 {
190 valid_chars += 1;
191 }
192 total_chars += 1;
193 i += 2;
194 } else {
195 total_chars += 1;
196 i += 1;
197 }
198 }
199
200 Ok(if total_chars > 0 {
201 valid_chars as f32 / total_chars as f32
202 } else {
203 0.0
204 })
205 }
206
207 fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
208 let mut valid_chars = 0;
209 let mut total_chars = 0;
210 let mut i = 0;
211
212 while i < data.len() {
213 if data[i] < 0x80 {
214 valid_chars += 1;
215 total_chars += 1;
216 i += 1;
217 } else if i + 1 < data.len() {
218 let byte1 = data[i];
219 let byte2 = data[i + 1];
220 if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
221 && (0x40..=0xFC).contains(&byte2)
222 && byte2 != 0x7F
223 {
224 valid_chars += 1;
225 }
226 total_chars += 1;
227 i += 2;
228 } else {
229 total_chars += 1;
230 i += 1;
231 }
232 }
233
234 Ok(if total_chars > 0 {
235 valid_chars as f32 / total_chars as f32
236 } else {
237 0.0
238 })
239 }
240
241 fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
242 let mut valid_chars = 0;
243 let mut total_chars = 0;
244 let mut i = 0;
245
246 while i < data.len() {
247 if data[i] < 0x80 {
248 valid_chars += 1;
249 total_chars += 1;
250 i += 1;
251 } else if i + 1 < data.len() {
252 let byte1 = data[i];
253 let byte2 = data[i + 1];
254 if (0xA1..=0xFE).contains(&byte1)
255 && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
256 {
257 valid_chars += 1;
258 }
259 total_chars += 1;
260 i += 2;
261 } else {
262 total_chars += 1;
263 i += 1;
264 }
265 }
266
267 Ok(if total_chars > 0 {
268 valid_chars as f32 / total_chars as f32
269 } else {
270 0.0
271 })
272 }
273
274 fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
275 let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
276 let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
277 if extended_count > 0 {
278 let utf8_conf = self.check_utf8_validity(data)?;
279 Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
280 } else {
281 Ok(0.5)
282 }
283 }
284
285 fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
286 let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
287 let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
288 if control_chars > 0 || extended_chars > 0 {
289 let utf8_conf = self.check_utf8_validity(data)?;
290 Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
291 } else {
292 Ok(0.3)
293 }
294 }
295
296 fn select_best_encoding(
297 &self,
298 candidates: Vec<EncodingCandidate>,
299 data: &[u8],
300 ) -> Result<EncodingInfo> {
301 if candidates.is_empty() {
302 return Ok(EncodingInfo {
303 charset: Charset::Unknown,
304 confidence: 0.0,
305 bom_detected: false,
306 sample_text: String::from("Unable to detect encoding"),
307 });
308 }
309 let best = &candidates[0];
310 if best.confidence < self.confidence_threshold {
311 return Ok(EncodingInfo {
312 charset: Charset::Utf8,
313 confidence: 0.5,
314 bom_detected: false,
315 sample_text: "Using default encoding: UTF-8".to_string(),
316 });
317 }
318 let sample = self.decode_sample(data, &best.charset)?;
319 Ok(EncodingInfo {
320 charset: best.charset.clone(),
321 confidence: best.confidence,
322 bom_detected: false,
323 sample_text: sample,
324 })
325 }
326
327 fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
328 let sample_size = data.len().min(200);
329 let sample_data = &data[0..sample_size];
330 match charset {
331 Charset::Utf8 => String::from_utf8(sample_data.to_vec())
332 .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
333 _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
334 }
335 }
336
337 fn default_charsets() -> Vec<Charset> {
338 vec![
339 Charset::Utf8,
340 Charset::Gbk,
341 Charset::ShiftJis,
342 Charset::Big5,
343 Charset::Iso88591,
344 Charset::Windows1252,
345 ]
346 }
347}
348
349#[derive(Debug, Clone)]
350struct EncodingCandidate {
351 charset: Charset,
352 confidence: f32,
353}
354
355impl Default for EncodingDetector {
356 fn default() -> Self {
357 Self::with_defaults()
358 }
359}
360
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Detector with a 0.7 threshold and a sample window of `max_sample_size`.
    fn detector_with_sample_size(max_sample_size: usize) -> EncodingDetector {
        EncodingDetector {
            confidence_threshold: 0.7,
            max_sample_size,
            supported_charsets: EncodingDetector::default_charsets(),
        }
    }

    /// Detector used by most tests: 0.7 threshold, 8192-byte sample window.
    fn create_test_detector() -> EncodingDetector {
        detector_with_sample_size(8192)
    }

    /// Valid multi-script UTF-8 without a BOM is detected as UTF-8.
    #[test]
    fn test_utf8_detection_accuracy() {
        let detector = create_test_detector();
        let utf8_text = "Hello, 世界! Bonjour, monde! 🌍";

        let result = detector.detect_encoding(utf8_text.as_bytes()).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.8);
        assert!(!result.bom_detected);
        assert!(result.sample_text.contains("Hello"));
    }

    /// A UTF-8 BOM short-circuits detection with full confidence.
    #[test]
    fn test_utf8_bom_detection() {
        let detector = create_test_detector();
        let mut bom_data = vec![0xEF, 0xBB, 0xBF];
        bom_data.extend_from_slice("Hello, World!".as_bytes());

        let result = detector.detect_encoding(&bom_data).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert_eq!(result.confidence, 1.0);
        assert!(result.bom_detected);
        assert_eq!(result.sample_text, "UTF-8 with BOM");
    }

    /// Both UTF-16 byte orders are recognised from their BOM.
    #[test]
    fn test_utf16_bom_detection() {
        let detector = create_test_detector();

        let utf16le_data = [0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00];
        let result = detector.detect_encoding(&utf16le_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Le);
        assert!(result.bom_detected);

        let utf16be_data = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65];
        let result = detector.detect_encoding(&utf16be_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Be);
        assert!(result.bom_detected);
    }

    /// Detection through the file-based entry point.
    #[test]
    fn test_file_encoding_detection() {
        let detector = create_test_detector();
        let temp_dir = TempDir::new().unwrap();

        let utf8_path = temp_dir.path().join("utf8.txt");
        fs::write(&utf8_path, "測試檔案編碼檢測功能。").unwrap();

        let result = detector
            .detect_file_encoding(utf8_path.to_str().unwrap())
            .unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// A missing file surfaces as an error.
    #[test]
    fn test_nonexistent_file_error() {
        let detector = create_test_detector();
        // Resolve the name inside a fresh temp dir so the outcome does not
        // depend on what happens to exist in the working directory.
        let temp_dir = TempDir::new().unwrap();
        let missing_path = temp_dir.path().join("nonexistent.txt");

        let result = detector.detect_file_encoding(missing_path.to_str().unwrap());

        assert!(result.is_err());
    }

    /// Byte pairs that are valid GBK produce a usable result.
    #[test]
    fn test_gbk_pattern_detection() {
        let detector = create_test_detector();

        let gbk_pattern = [0xC4, 0xE3, 0xBA, 0xC3, 0xCA, 0xC0, 0xBD, 0xE7];

        let result = detector.detect_encoding(&gbk_pattern).unwrap();

        assert!(result.confidence > 0.3);
        if result.charset == Charset::Gbk {
            assert!(result.confidence > 0.5);
        }
    }

    /// Byte pairs that are valid Shift-JIS produce a usable result.
    #[test]
    fn test_shift_jis_detection() {
        let detector = create_test_detector();

        let shift_jis_pattern = [
            0x82, 0xB1, 0x82, 0xF1, 0x82, 0xB1, 0x82, 0xF1, 0x82, 0xC9, 0x82, 0xBF,
        ];

        let result = detector.detect_encoding(&shift_jis_pattern).unwrap();

        assert!(result.confidence > 0.2);
    }

    /// Clean ASCII text scores higher than a run of high bytes.
    #[test]
    fn test_encoding_confidence_ranking() {
        let detector = create_test_detector();

        let clear_utf8 = "Clear English text with numbers 123.";
        let utf8_result = detector.detect_encoding(clear_utf8.as_bytes()).unwrap();

        let ambiguous_data: Vec<u8> = (0x80..=0xFF).cycle().take(50).collect();
        let ambiguous_result = detector.detect_encoding(&ambiguous_data).unwrap();

        assert!(utf8_result.confidence > ambiguous_result.confidence);
    }

    /// Only the first `max_sample_size` bytes of a file influence detection.
    #[test]
    fn test_max_sample_size_limit() {
        // Input larger than the default window is still handled in memory.
        let large_data = vec![b'A'; 10000];
        let result = create_test_detector().detect_encoding(&large_data).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.9);

        // Sixteen ASCII bytes followed by bytes that are invalid in UTF-8:
        // with a 16-byte window the invalid tail must never be looked at.
        let temp_dir = TempDir::new().unwrap();
        let oversized_path = temp_dir.path().join("oversized.txt");
        let mut contents = vec![b'A'; 16];
        contents.extend_from_slice(&[0xFF; 16]);
        fs::write(&oversized_path, &contents).unwrap();

        let result = detector_with_sample_size(16)
            .detect_file_encoding(oversized_path.to_str().unwrap())
            .unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.9);
    }

    /// ASCII text with one two-byte UTF-8 character is detected as UTF-8.
    #[test]
    fn test_encoding_candidate_selection() {
        let detector = create_test_detector();

        let mut mixed_data = b"English text ".to_vec();
        mixed_data.extend_from_slice(&[0xC3, 0xA9]);
        mixed_data.extend_from_slice(b" and more text");

        let result = detector.detect_encoding(&mixed_data).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// Arbitrary bytes always yield a confidence inside `[0, 1]`.
    #[test]
    fn test_unknown_encoding_fallback() {
        let detector = create_test_detector();

        let random_data: Vec<u8> = (0..100).map(|i| (i * 7 + 13) as u8).collect();
        let result = detector.detect_encoding(&random_data).unwrap();

        assert!(result.confidence >= 0.0);
        assert!(result.confidence <= 1.0);
    }

    /// Detection of a few kilobytes finishes well within 100 ms.
    #[test]
    fn test_detection_performance() {
        let detector = create_test_detector();

        let large_text = "Hello, World! ".repeat(500);

        let start = std::time::Instant::now();
        let _result = detector.detect_encoding(large_text.as_bytes()).unwrap();
        let duration = start.elapsed();

        assert!(duration.as_millis() < 100);
    }
}