1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use std::fs::File;
4use std::io::Read;
5
6pub struct EncodingDetector {
8 confidence_threshold: f32,
9 max_sample_size: usize,
10 supported_charsets: Vec<Charset>,
11 default_encoding: String,
12}
13
14impl EncodingDetector {
15 pub fn new(config: &crate::config::Config) -> Self {
17 Self {
18 confidence_threshold: config.formats.encoding_detection_confidence,
19 max_sample_size: 8192,
20 supported_charsets: Self::default_charsets(),
21 default_encoding: config.formats.default_encoding.clone(),
22 }
23 }
24
25 pub fn with_defaults() -> Self {
27 Self {
28 confidence_threshold: 0.8, max_sample_size: 8192,
30 supported_charsets: Self::default_charsets(),
31 default_encoding: "utf-8".to_string(),
32 }
33 }
34
35 pub fn with_config(config: &crate::config::Config) -> Self {
37 Self {
38 confidence_threshold: config.formats.encoding_detection_confidence,
39 max_sample_size: 8192,
40 supported_charsets: Self::default_charsets(),
41 default_encoding: config.formats.default_encoding.clone(),
42 }
43 }
44
45 pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
47 let mut file = File::open(file_path)?;
48 let mut buffer = vec![0; self.max_sample_size];
49 let bytes_read = file.read(&mut buffer)?;
50 buffer.truncate(bytes_read);
51 self.detect_encoding(&buffer)
52 }
53
54 pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
56 if let Some(encoding) = self.detect_bom(data) {
57 return Ok(encoding);
58 }
59 let candidates = self.analyze_byte_patterns(data)?;
60 self.select_best_encoding(candidates, data)
61 }
62
63 fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
64 if data.len() < 3 {
65 return None;
66 }
67 match &data[0..3] {
68 [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
69 charset: Charset::Utf8,
70 confidence: 1.0,
71 bom_detected: true,
72 sample_text: String::from("UTF-8 with BOM"),
73 }),
74 [0xFF, 0xFE, ..] => Some(EncodingInfo {
75 charset: Charset::Utf16Le,
76 confidence: 1.0,
77 bom_detected: true,
78 sample_text: String::from("UTF-16 LE with BOM"),
79 }),
80 [0xFE, 0xFF, ..] => Some(EncodingInfo {
81 charset: Charset::Utf16Be,
82 confidence: 1.0,
83 bom_detected: true,
84 sample_text: String::from("UTF-16 BE with BOM"),
85 }),
86 _ => {
87 if data.len() >= 4 {
88 match &data[0..4] {
89 [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
90 charset: Charset::Utf32Le,
91 confidence: 1.0,
92 bom_detected: true,
93 sample_text: String::from("UTF-32 LE with BOM"),
94 }),
95 [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
96 charset: Charset::Utf32Be,
97 confidence: 1.0,
98 bom_detected: true,
99 sample_text: String::from("UTF-32 BE with BOM"),
100 }),
101 _ => None,
102 }
103 } else {
104 None
105 }
106 }
107 }
108 }
109
110 fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
111 let mut candidates = Vec::new();
112 for charset in &self.supported_charsets {
113 let confidence = self.calculate_encoding_confidence(data, charset)?;
114 if confidence > 0.1 {
115 candidates.push(EncodingCandidate {
116 charset: charset.clone(),
117 confidence,
118 });
119 }
120 }
121 candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
122 Ok(candidates)
123 }
124
125 fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
126 match charset {
127 Charset::Utf8 => self.check_utf8_validity(data),
128 Charset::Gbk => self.check_gbk_patterns(data),
129 Charset::ShiftJis => self.check_shift_jis_patterns(data),
130 Charset::Big5 => self.check_big5_patterns(data),
131 Charset::Iso88591 => self.check_iso88591_patterns(data),
132 Charset::Windows1252 => self.check_windows1252_patterns(data),
133 _ => Ok(0.0),
134 }
135 }
136
137 fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
138 let mut valid_chars = 0;
139 let mut total_chars = 0;
140 let mut i = 0;
141
142 while i < data.len() {
143 total_chars += 1;
144 if data[i] & 0x80 == 0 {
145 valid_chars += 1;
146 i += 1;
147 } else if data[i] & 0xE0 == 0xC0 {
148 if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
149 valid_chars += 1;
150 }
151 i += 2;
152 } else if data[i] & 0xF0 == 0xE0 {
153 if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
154 valid_chars += 1;
155 }
156 i += 3;
157 } else if data[i] & 0xF8 == 0xF0 {
158 if i + 3 < data.len()
159 && data[i + 1] & 0xC0 == 0x80
160 && data[i + 2] & 0xC0 == 0x80
161 && data[i + 3] & 0xC0 == 0x80
162 {
163 valid_chars += 1;
164 }
165 i += 4;
166 } else {
167 i += 1;
168 }
169 }
170
171 Ok(if total_chars > 0 {
172 valid_chars as f32 / total_chars as f32
173 } else {
174 0.0
175 })
176 }
177
178 fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
179 let mut valid_chars = 0;
180 let mut total_chars = 0;
181 let mut i = 0;
182
183 while i < data.len() {
184 if data[i] < 0x80 {
185 valid_chars += 1;
186 total_chars += 1;
187 i += 1;
188 } else if i + 1 < data.len() {
189 let byte1 = data[i];
190 let byte2 = data[i + 1];
191 if (0x81..=0xFE).contains(&byte1)
192 && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
193 {
194 valid_chars += 1;
195 }
196 total_chars += 1;
197 i += 2;
198 } else {
199 total_chars += 1;
200 i += 1;
201 }
202 }
203
204 Ok(if total_chars > 0 {
205 valid_chars as f32 / total_chars as f32
206 } else {
207 0.0
208 })
209 }
210
211 fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
212 let mut valid_chars = 0;
213 let mut total_chars = 0;
214 let mut i = 0;
215
216 while i < data.len() {
217 if data[i] < 0x80 {
218 valid_chars += 1;
219 total_chars += 1;
220 i += 1;
221 } else if i + 1 < data.len() {
222 let byte1 = data[i];
223 let byte2 = data[i + 1];
224 if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
225 && (0x40..=0xFC).contains(&byte2)
226 && byte2 != 0x7F
227 {
228 valid_chars += 1;
229 }
230 total_chars += 1;
231 i += 2;
232 } else {
233 total_chars += 1;
234 i += 1;
235 }
236 }
237
238 Ok(if total_chars > 0 {
239 valid_chars as f32 / total_chars as f32
240 } else {
241 0.0
242 })
243 }
244
245 fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
246 let mut valid_chars = 0;
247 let mut total_chars = 0;
248 let mut i = 0;
249
250 while i < data.len() {
251 if data[i] < 0x80 {
252 valid_chars += 1;
253 total_chars += 1;
254 i += 1;
255 } else if i + 1 < data.len() {
256 let byte1 = data[i];
257 let byte2 = data[i + 1];
258 if (0xA1..=0xFE).contains(&byte1)
259 && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
260 {
261 valid_chars += 1;
262 }
263 total_chars += 1;
264 i += 2;
265 } else {
266 total_chars += 1;
267 i += 1;
268 }
269 }
270
271 Ok(if total_chars > 0 {
272 valid_chars as f32 / total_chars as f32
273 } else {
274 0.0
275 })
276 }
277
278 fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
279 let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
280 let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
281 if extended_count > 0 {
282 let utf8_conf = self.check_utf8_validity(data)?;
283 Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
284 } else {
285 Ok(0.5)
286 }
287 }
288
289 fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
290 let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
291 let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
292 if control_chars > 0 || extended_chars > 0 {
293 let utf8_conf = self.check_utf8_validity(data)?;
294 Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
295 } else {
296 Ok(0.3)
297 }
298 }
299
300 fn select_best_encoding(
301 &self,
302 candidates: Vec<EncodingCandidate>,
303 data: &[u8],
304 ) -> Result<EncodingInfo> {
305 if candidates.is_empty() {
306 let default_charset = self.parse_charset_name(&self.default_encoding);
307 let sample = self.decode_sample(data, &default_charset)?;
308 return Ok(EncodingInfo {
309 charset: default_charset,
310 confidence: 0.1,
311 bom_detected: false,
312 sample_text: format!(
313 "Unable to detect encoding, using default: {} (sample: {})",
314 self.default_encoding,
315 sample.chars().take(50).collect::<String>()
316 ),
317 });
318 }
319 let best = &candidates[0];
320 if best.confidence < self.confidence_threshold {
321 let default_charset = self.parse_charset_name(&self.default_encoding);
322 let sample = self.decode_sample(data, &default_charset)?;
323 return Ok(EncodingInfo {
324 charset: default_charset,
325 confidence: 0.5,
326 bom_detected: false,
327 sample_text: format!(
328 "Low confidence detection, using default: {} (sample: {})",
329 self.default_encoding,
330 sample.chars().take(50).collect::<String>()
331 ),
332 });
333 }
334 let sample = self.decode_sample(data, &best.charset)?;
335 Ok(EncodingInfo {
336 charset: best.charset.clone(),
337 confidence: best.confidence,
338 bom_detected: false,
339 sample_text: sample,
340 })
341 }
342
343 fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
344 let sample_size = data.len().min(200);
345 let sample_data = &data[0..sample_size];
346 match charset {
347 Charset::Utf8 => String::from_utf8(sample_data.to_vec())
348 .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
349 _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
350 }
351 }
352
353 fn default_charsets() -> Vec<Charset> {
354 vec![
355 Charset::Utf8,
356 Charset::Gbk,
357 Charset::ShiftJis,
358 Charset::Big5,
359 Charset::Iso88591,
360 Charset::Windows1252,
361 ]
362 }
363
364 fn parse_charset_name(&self, encoding_name: &str) -> Charset {
366 match encoding_name.to_lowercase().as_str() {
367 "utf-8" | "utf8" => Charset::Utf8,
368 "utf-16le" | "utf16le" => Charset::Utf16Le,
369 "utf-16be" | "utf16be" => Charset::Utf16Be,
370 "utf-32le" | "utf32le" => Charset::Utf32Le,
371 "utf-32be" | "utf32be" => Charset::Utf32Be,
372 "gbk" | "gb2312" => Charset::Gbk,
373 "shift-jis" | "shift_jis" | "sjis" => Charset::ShiftJis,
374 "iso-8859-1" | "iso88591" | "latin1" => Charset::Iso88591,
375 "windows-1252" | "windows1252" | "cp1252" => Charset::Windows1252,
376 "big5" => Charset::Big5,
377 "euc-kr" | "euckr" => Charset::Euckr,
378 _ => Charset::Utf8, }
380 }
381}
382
/// A charset paired with the score it received for a given input.
#[derive(Debug, Clone)]
struct EncodingCandidate {
    /// Charset that was scored.
    charset: Charset,
    /// Score assigned to `charset`; higher means a better match.
    confidence: f32,
}
388
389impl Default for EncodingDetector {
390 fn default() -> Self {
391 Self::with_defaults()
392 }
393}
394
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Detector with a 0.7 confidence threshold and UTF-8 as the fallback.
    fn create_test_detector() -> EncodingDetector {
        EncodingDetector {
            confidence_threshold: 0.7,
            max_sample_size: 8192,
            supported_charsets: EncodingDetector::default_charsets(),
            default_encoding: String::from("utf-8"),
        }
    }

    /// Mixed-script UTF-8 text without a BOM is detected as UTF-8.
    #[test]
    fn test_utf8_detection_accuracy() {
        let text = "Hello, 世界! Bonjour, monde! 🌍";

        let info = create_test_detector()
            .detect_encoding(text.as_bytes())
            .unwrap();

        assert_eq!(info.charset, Charset::Utf8);
        assert!(info.confidence > 0.8);
        assert!(!info.bom_detected);
        assert!(info.sample_text.contains("Hello"));
    }

    /// A UTF-8 BOM decides the result with full confidence.
    #[test]
    fn test_utf8_bom_detection() {
        let with_bom = b"\xEF\xBB\xBFHello, World!".to_vec();

        let info = create_test_detector().detect_encoding(&with_bom).unwrap();

        assert_eq!(info.charset, Charset::Utf8);
        assert_eq!(info.confidence, 1.0);
        assert!(info.bom_detected);
        assert_eq!(info.sample_text, "UTF-8 with BOM");
    }

    /// Both UTF-16 byte orders are recognised from their BOM.
    #[test]
    fn test_utf16_bom_detection() {
        let detector = create_test_detector();
        let expect_bom = |bytes: &[u8], expected: Charset| {
            let info = detector.detect_encoding(bytes).unwrap();
            assert_eq!(info.charset, expected);
            assert!(info.bom_detected);
        };

        expect_bom(&[0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00], Charset::Utf16Le);
        expect_bom(&[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65], Charset::Utf16Be);
    }

    /// Detection works on a file written to disk as UTF-8.
    #[test]
    fn test_file_encoding_detection() {
        let workspace = TempDir::new().unwrap();
        let path = workspace.path().join("utf8.txt");
        fs::write(&path, "測試檔案編碼檢測功能。").unwrap();

        let info = create_test_detector()
            .detect_file_encoding(path.to_str().unwrap())
            .unwrap();

        assert_eq!(info.charset, Charset::Utf8);
        assert!(info.confidence > 0.7);
    }

    /// A missing file surfaces as an error rather than a fallback result.
    #[test]
    fn test_nonexistent_file_error() {
        let outcome = create_test_detector().detect_file_encoding("nonexistent.txt");

        assert!(outcome.is_err());
    }

    /// GBK-looking byte pairs produce a usable confidence.
    #[test]
    fn test_gbk_pattern_detection() {
        let bytes: [u8; 8] = [0xC4, 0xE3, 0xBA, 0xC3, 0xCA, 0xC0, 0xBD, 0xE7];

        let info = create_test_detector().detect_encoding(&bytes).unwrap();

        assert!(info.confidence > 0.3);
        if info.charset == Charset::Gbk {
            assert!(info.confidence > 0.5);
        }
    }

    /// Shift-JIS-looking byte pairs produce a non-trivial confidence.
    #[test]
    fn test_shift_jis_detection() {
        let bytes: [u8; 12] = [
            0x82, 0xB1, 0x82, 0xF1, 0x82, 0xB1, 0x82, 0xF1, 0x82, 0xC9, 0x82, 0xBF,
        ];

        let info = create_test_detector().detect_encoding(&bytes).unwrap();

        assert!(info.confidence > 0.2);
    }

    /// Plain ASCII must score higher than a run of high bytes.
    #[test]
    fn test_encoding_confidence_ranking() {
        let detector = create_test_detector();
        let high_bytes: Vec<u8> = (0x80..=0xFF).cycle().take(50).collect();

        let clear = detector
            .detect_encoding("Clear English text with numbers 123.".as_bytes())
            .unwrap();
        let ambiguous = detector.detect_encoding(&high_bytes).unwrap();

        assert!(clear.confidence > ambiguous.confidence);
    }

    /// Input larger than the file sample size is still detected in memory.
    #[test]
    fn test_max_sample_size_limit() {
        let payload = vec![b'A'; 10000];

        let info = create_test_detector().detect_encoding(&payload).unwrap();

        assert_eq!(info.charset, Charset::Utf8);
        assert!(info.confidence > 0.9);
    }

    /// ASCII text with one two-byte UTF-8 character is detected as UTF-8.
    #[test]
    fn test_encoding_candidate_selection() {
        let mixed = b"English text \xC3\xA9 and more text".to_vec();

        let info = create_test_detector().detect_encoding(&mixed).unwrap();

        assert_eq!(info.charset, Charset::Utf8);
        assert!(info.confidence > 0.7);
    }

    /// Arbitrary bytes still yield a confidence within [0, 1].
    #[test]
    fn test_unknown_encoding_fallback() {
        let noise: Vec<u8> = (0..100).map(|i| (i * 7 + 13) as u8).collect();

        let info = create_test_detector().detect_encoding(&noise).unwrap();

        assert!(info.confidence >= 0.0);
        assert!(info.confidence <= 1.0);
    }

    /// Detection of a few kilobytes of text completes within 100 ms.
    #[test]
    fn test_detection_performance() {
        let detector = create_test_detector();
        let text = "Hello, World! ".repeat(500);

        let started = std::time::Instant::now();
        let _info = detector.detect_encoding(text.as_bytes()).unwrap();
        let elapsed = started.elapsed();

        assert!(elapsed.as_millis() < 100);
    }

    /// With a very high threshold the configured default encoding is reported.
    #[test]
    fn test_default_encoding_usage() {
        let mut detector = EncodingDetector {
            confidence_threshold: 0.95,
            max_sample_size: 8192,
            supported_charsets: EncodingDetector::default_charsets(),
            default_encoding: String::from("gbk"),
        };
        let bytes = vec![0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

        let first = detector.detect_encoding(&bytes).unwrap();
        assert_eq!(first.charset, Charset::Gbk);
        assert!(first.sample_text.contains("gbk") || first.sample_text.contains("default"));
        assert!(first.confidence < 0.95);

        detector.default_encoding = String::from("utf-16le");
        let second = detector.detect_encoding(&bytes).unwrap();
        assert_eq!(second.charset, Charset::Utf16Le);
        assert!(second.sample_text.contains("utf-16le") || second.sample_text.contains("default"));
    }

    /// Encoding names map to charsets case-insensitively; unknown names map to UTF-8.
    #[test]
    fn test_encoding_name_parsing() {
        let detector = create_test_detector();
        let cases = [
            ("utf-8", Charset::Utf8),
            ("UTF8", Charset::Utf8),
            ("gbk", Charset::Gbk),
            ("GBK", Charset::Gbk),
            ("shift-jis", Charset::ShiftJis),
            ("SHIFT_JIS", Charset::ShiftJis),
            ("big5", Charset::Big5),
            ("iso-8859-1", Charset::Iso88591),
            ("windows-1252", Charset::Windows1252),
            ("unknown-encoding", Charset::Utf8),
        ];

        for (name, expected) in cases.iter() {
            assert_eq!(detector.parse_charset_name(name), *expected);
        }
    }

    /// Values from `Config` end up in the detector built by `new`.
    #[test]
    fn test_config_integration() {
        use crate::config::Config;

        let mut config = Config::default();
        config.formats.default_encoding = String::from("gbk");
        config.formats.encoding_detection_confidence = 0.9;

        let detector = EncodingDetector::new(&config);
        assert_eq!(detector.default_encoding, "gbk");
        assert_eq!(detector.confidence_threshold, 0.9);

        let ascii = vec![0x48, 0x65, 0x6C, 0x6C, 0x6F];
        let info = detector.detect_encoding(&ascii).unwrap();

        if info.confidence < 0.9 {
            assert_eq!(info.charset, Charset::Gbk);
        }
    }
}