1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use std::fs::File;
4use std::io::Read;
5
6pub struct EncodingDetector {
8 confidence_threshold: f32,
9 max_sample_size: usize,
10 supported_charsets: Vec<Charset>,
11 default_encoding: String,
12}
13
14impl EncodingDetector {
15 pub fn new(config: &crate::config::Config) -> Self {
17 Self {
18 confidence_threshold: config.formats.encoding_detection_confidence,
19 max_sample_size: 8192,
20 supported_charsets: Self::default_charsets(),
21 default_encoding: config.formats.default_encoding.clone(),
22 }
23 }
24
25 pub fn with_defaults() -> Self {
27 Self {
28 confidence_threshold: 0.8, max_sample_size: 8192,
30 supported_charsets: Self::default_charsets(),
31 default_encoding: "utf-8".to_string(),
32 }
33 }
34
35 pub fn with_config(config: &crate::config::Config) -> Self {
37 Self {
38 confidence_threshold: config.formats.encoding_detection_confidence,
39 max_sample_size: 8192,
40 supported_charsets: Self::default_charsets(),
41 default_encoding: config.formats.default_encoding.clone(),
42 }
43 }
44
45 pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
47 crate::core::fs_util::check_file_size(
48 std::path::Path::new(file_path),
49 52_428_800,
50 "Subtitle",
51 )?;
52 let mut file = File::open(file_path)?;
53 let mut buffer = vec![0; self.max_sample_size];
54 let bytes_read = file.read(&mut buffer)?;
55 buffer.truncate(bytes_read);
56 self.detect_encoding(&buffer)
57 }
58
59 pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
61 if let Some(encoding) = self.detect_bom(data) {
62 return Ok(encoding);
63 }
64 let candidates = self.analyze_byte_patterns(data)?;
65 self.select_best_encoding(candidates, data)
66 }
67
68 fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
69 if data.len() < 3 {
70 return None;
71 }
72 match &data[0..3] {
73 [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
74 charset: Charset::Utf8,
75 confidence: 1.0,
76 bom_detected: true,
77 sample_text: String::from("UTF-8 with BOM"),
78 }),
79 [0xFF, 0xFE, ..] => Some(EncodingInfo {
80 charset: Charset::Utf16Le,
81 confidence: 1.0,
82 bom_detected: true,
83 sample_text: String::from("UTF-16 LE with BOM"),
84 }),
85 [0xFE, 0xFF, ..] => Some(EncodingInfo {
86 charset: Charset::Utf16Be,
87 confidence: 1.0,
88 bom_detected: true,
89 sample_text: String::from("UTF-16 BE with BOM"),
90 }),
91 _ => {
92 if data.len() >= 4 {
93 match &data[0..4] {
94 [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
95 charset: Charset::Utf32Le,
96 confidence: 1.0,
97 bom_detected: true,
98 sample_text: String::from("UTF-32 LE with BOM"),
99 }),
100 [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
101 charset: Charset::Utf32Be,
102 confidence: 1.0,
103 bom_detected: true,
104 sample_text: String::from("UTF-32 BE with BOM"),
105 }),
106 _ => None,
107 }
108 } else {
109 None
110 }
111 }
112 }
113 }
114
115 fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
116 let mut candidates = Vec::new();
117 for charset in &self.supported_charsets {
118 let confidence = self.calculate_encoding_confidence(data, charset)?;
119 if confidence > 0.1 {
120 candidates.push(EncodingCandidate {
121 charset: charset.clone(),
122 confidence,
123 });
124 }
125 }
126 candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
127 Ok(candidates)
128 }
129
130 fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
131 match charset {
132 Charset::Utf8 => self.check_utf8_validity(data),
133 Charset::Gbk => self.check_gbk_patterns(data),
134 Charset::ShiftJis => self.check_shift_jis_patterns(data),
135 Charset::Big5 => self.check_big5_patterns(data),
136 Charset::Iso88591 => self.check_iso88591_patterns(data),
137 Charset::Windows1252 => self.check_windows1252_patterns(data),
138 _ => Ok(0.0),
139 }
140 }
141
142 fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
143 let mut valid_chars = 0;
144 let mut total_chars = 0;
145 let mut i = 0;
146
147 while i < data.len() {
148 total_chars += 1;
149 if data[i] & 0x80 == 0 {
150 valid_chars += 1;
151 i += 1;
152 } else if data[i] & 0xE0 == 0xC0 {
153 if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
154 valid_chars += 1;
155 }
156 i += 2;
157 } else if data[i] & 0xF0 == 0xE0 {
158 if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
159 valid_chars += 1;
160 }
161 i += 3;
162 } else if data[i] & 0xF8 == 0xF0 {
163 if i + 3 < data.len()
164 && data[i + 1] & 0xC0 == 0x80
165 && data[i + 2] & 0xC0 == 0x80
166 && data[i + 3] & 0xC0 == 0x80
167 {
168 valid_chars += 1;
169 }
170 i += 4;
171 } else {
172 i += 1;
173 }
174 }
175
176 Ok(if total_chars > 0 {
177 valid_chars as f32 / total_chars as f32
178 } else {
179 0.0
180 })
181 }
182
183 fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
184 let mut valid_chars = 0;
185 let mut total_chars = 0;
186 let mut i = 0;
187
188 while i < data.len() {
189 if data[i] < 0x80 {
190 valid_chars += 1;
191 total_chars += 1;
192 i += 1;
193 } else if i + 1 < data.len() {
194 let byte1 = data[i];
195 let byte2 = data[i + 1];
196 if (0x81..=0xFE).contains(&byte1)
197 && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
198 {
199 valid_chars += 1;
200 }
201 total_chars += 1;
202 i += 2;
203 } else {
204 total_chars += 1;
205 i += 1;
206 }
207 }
208
209 Ok(if total_chars > 0 {
210 valid_chars as f32 / total_chars as f32
211 } else {
212 0.0
213 })
214 }
215
216 fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
217 let mut valid_chars = 0;
218 let mut total_chars = 0;
219 let mut i = 0;
220
221 while i < data.len() {
222 if data[i] < 0x80 {
223 valid_chars += 1;
224 total_chars += 1;
225 i += 1;
226 } else if i + 1 < data.len() {
227 let byte1 = data[i];
228 let byte2 = data[i + 1];
229 if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
230 && (0x40..=0xFC).contains(&byte2)
231 && byte2 != 0x7F
232 {
233 valid_chars += 1;
234 }
235 total_chars += 1;
236 i += 2;
237 } else {
238 total_chars += 1;
239 i += 1;
240 }
241 }
242
243 Ok(if total_chars > 0 {
244 valid_chars as f32 / total_chars as f32
245 } else {
246 0.0
247 })
248 }
249
250 fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
251 let mut valid_chars = 0;
252 let mut total_chars = 0;
253 let mut i = 0;
254
255 while i < data.len() {
256 if data[i] < 0x80 {
257 valid_chars += 1;
258 total_chars += 1;
259 i += 1;
260 } else if i + 1 < data.len() {
261 let byte1 = data[i];
262 let byte2 = data[i + 1];
263 if (0xA1..=0xFE).contains(&byte1)
264 && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
265 {
266 valid_chars += 1;
267 }
268 total_chars += 1;
269 i += 2;
270 } else {
271 total_chars += 1;
272 i += 1;
273 }
274 }
275
276 Ok(if total_chars > 0 {
277 valid_chars as f32 / total_chars as f32
278 } else {
279 0.0
280 })
281 }
282
283 fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
284 let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
285 let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
286 if extended_count > 0 {
287 let utf8_conf = self.check_utf8_validity(data)?;
288 Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
289 } else {
290 Ok(0.5)
291 }
292 }
293
294 fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
295 let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
296 let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
297 if control_chars > 0 || extended_chars > 0 {
298 let utf8_conf = self.check_utf8_validity(data)?;
299 Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
300 } else {
301 Ok(0.3)
302 }
303 }
304
305 fn select_best_encoding(
306 &self,
307 candidates: Vec<EncodingCandidate>,
308 data: &[u8],
309 ) -> Result<EncodingInfo> {
310 if candidates.is_empty() {
311 let default_charset = self.parse_charset_name(&self.default_encoding);
312 let sample = self.decode_sample(data, &default_charset)?;
313 return Ok(EncodingInfo {
314 charset: default_charset,
315 confidence: 0.1,
316 bom_detected: false,
317 sample_text: format!(
318 "Unable to detect encoding, using default: {} (sample: {})",
319 self.default_encoding,
320 sample.chars().take(50).collect::<String>()
321 ),
322 });
323 }
324 let best = &candidates[0];
325 if best.confidence < self.confidence_threshold {
326 let default_charset = self.parse_charset_name(&self.default_encoding);
327 let sample = self.decode_sample(data, &default_charset)?;
328 return Ok(EncodingInfo {
329 charset: default_charset,
330 confidence: 0.5,
331 bom_detected: false,
332 sample_text: format!(
333 "Low confidence detection, using default: {} (sample: {})",
334 self.default_encoding,
335 sample.chars().take(50).collect::<String>()
336 ),
337 });
338 }
339 let sample = self.decode_sample(data, &best.charset)?;
340 Ok(EncodingInfo {
341 charset: best.charset.clone(),
342 confidence: best.confidence,
343 bom_detected: false,
344 sample_text: sample,
345 })
346 }
347
348 fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
349 let sample_size = data.len().min(200);
350 let sample_data = &data[0..sample_size];
351 match charset {
352 Charset::Utf8 => String::from_utf8(sample_data.to_vec())
353 .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
354 _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
355 }
356 }
357
358 fn default_charsets() -> Vec<Charset> {
359 vec![
360 Charset::Utf8,
361 Charset::Gbk,
362 Charset::ShiftJis,
363 Charset::Big5,
364 Charset::Iso88591,
365 Charset::Windows1252,
366 ]
367 }
368
369 fn parse_charset_name(&self, encoding_name: &str) -> Charset {
371 match encoding_name.to_lowercase().as_str() {
372 "utf-8" | "utf8" => Charset::Utf8,
373 "utf-16le" | "utf16le" => Charset::Utf16Le,
374 "utf-16be" | "utf16be" => Charset::Utf16Be,
375 "utf-32le" | "utf32le" => Charset::Utf32Le,
376 "utf-32be" | "utf32be" => Charset::Utf32Be,
377 "gbk" | "gb2312" => Charset::Gbk,
378 "shift-jis" | "shift_jis" | "sjis" => Charset::ShiftJis,
379 "iso-8859-1" | "iso88591" | "latin1" => Charset::Iso88591,
380 "windows-1252" | "windows1252" | "cp1252" => Charset::Windows1252,
381 "big5" => Charset::Big5,
382 "euc-kr" | "euckr" => Charset::Euckr,
383 _ => Charset::Utf8, }
385 }
386}
387
388#[derive(Debug, Clone)]
389struct EncodingCandidate {
390 charset: Charset,
391 confidence: f32,
392}
393
394impl Default for EncodingDetector {
395 fn default() -> Self {
396 Self::with_defaults()
397 }
398}
399
400#[cfg(test)]
401mod tests {
402 use super::*;
403 use std::fs;
404 use tempfile::TempDir;
405
406 fn create_test_detector() -> EncodingDetector {
407 EncodingDetector {
408 confidence_threshold: 0.7,
409 max_sample_size: 8192,
410 supported_charsets: EncodingDetector::default_charsets(),
411 default_encoding: "utf-8".to_string(),
412 }
413 }
414
/// BOM-less text mixing ASCII, CJK and an emoji is detected as UTF-8
/// with high confidence.
#[test]
fn test_utf8_detection_accuracy() {
    let sample = "Hello, 世界! Bonjour, monde! 🌍";

    let info = create_test_detector()
        .detect_encoding(sample.as_bytes())
        .unwrap();

    assert_eq!(info.charset, Charset::Utf8);
    assert!(info.confidence > 0.8);
    assert!(!info.bom_detected);
    assert!(info.sample_text.contains("Hello"));
}
428
/// A leading EF BB BF signature short-circuits detection to UTF-8 with
/// full confidence.
#[test]
fn test_utf8_bom_detection() {
    let payload: Vec<u8> = [0xEF, 0xBB, 0xBF]
        .iter()
        .copied()
        .chain("Hello, World!".bytes())
        .collect();

    let info = create_test_detector().detect_encoding(&payload).unwrap();

    assert_eq!(info.charset, Charset::Utf8);
    assert_eq!(info.confidence, 1.0);
    assert!(info.bom_detected);
    assert_eq!(info.sample_text, "UTF-8 with BOM");
}
443
/// Both UTF-16 byte orders are recognised from their BOM.
#[test]
fn test_utf16_bom_detection() {
    let detector = create_test_detector();

    let expect_bom = |bytes: &[u8], expected: Charset| {
        let info = detector.detect_encoding(bytes).unwrap();
        assert_eq!(info.charset, expected);
        assert!(info.bom_detected);
    };

    // FF FE + "He" in little-endian code units.
    expect_bom(&[0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00], Charset::Utf16Le);
    // FE FF + "He" in big-endian code units.
    expect_bom(&[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65], Charset::Utf16Be);
}
461
/// A UTF-8 file written to a temp directory is detected as UTF-8 through
/// the file-based entry point.
#[test]
fn test_file_encoding_detection() {
    let scratch = TempDir::new().unwrap();
    let file_path = scratch.path().join("utf8.txt");
    fs::write(&file_path, "測試檔案編碼檢測功能。").unwrap();

    let path_str = file_path.to_str().unwrap();
    let info = create_test_detector()
        .detect_file_encoding(path_str)
        .unwrap();

    assert_eq!(info.charset, Charset::Utf8);
    assert!(info.confidence > 0.7);
}
479
/// Detecting the encoding of a missing file reports an error instead of
/// panicking.
#[test]
fn test_nonexistent_file_error() {
    let detector = create_test_detector();

    // Resolve the name inside a fresh temp directory so the file is
    // guaranteed absent no matter what the working directory contains.
    let temp_dir = TempDir::new().unwrap();
    let missing = temp_dir.path().join("nonexistent.txt");

    let result = detector.detect_file_encoding(missing.to_str().unwrap());

    assert!(result.is_err());
}
488
/// High-byte pairs in the GBK lead/trail ranges yield a usable result;
/// when GBK itself is chosen, its confidence must be solid.
#[test]
fn test_gbk_pattern_detection() {
    let gbk_bytes: [u8; 8] = [0xC4, 0xE3, 0xBA, 0xC3, 0xCA, 0xC0, 0xBD, 0xE7];

    let info = create_test_detector().detect_encoding(&gbk_bytes).unwrap();

    assert!(info.confidence > 0.3);
    if info.charset == Charset::Gbk {
        assert!(info.confidence > 0.5);
    }
}
508
/// Pairs with a 0x82 lead byte (inside the Shift-JIS lead range) produce
/// a result with non-trivial confidence. The detected charset is not
/// pinned because several double-byte layouts overlap on these bytes.
#[test]
fn test_shift_jis_detection() {
    let sjis_bytes: [u8; 12] = [
        0x82, 0xB1, 0x82, 0xF1, 0x82, 0xB1, 0x82, 0xF1, 0x82, 0xC9, 0x82, 0xBF,
    ];

    let info = create_test_detector()
        .detect_encoding(&sjis_bytes)
        .unwrap();

    assert!(info.confidence > 0.2);
}
526
/// Plain ASCII text must be reported with higher confidence than a run
/// of 50 consecutive high bytes starting at 0x80.
#[test]
fn test_encoding_confidence_ranking() {
    let detector = create_test_detector();

    let plain_text = "Clear English text with numbers 123.";
    let high_bytes = (0x80..=0xFF).cycle().take(50).collect::<Vec<u8>>();

    let plain = detector.detect_encoding(plain_text.as_bytes()).unwrap();
    let noisy = detector.detect_encoding(&high_bytes).unwrap();

    assert!(plain.confidence > noisy.confidence);
}
542
/// A 10000-byte ASCII buffer (larger than the 8192-byte sample size of
/// the fixture) passed straight to `detect_encoding` is still detected as
/// UTF-8 with near-certain confidence.
#[test]
fn test_max_sample_size_limit() {
    let oversized = vec![b'A'; 10000];

    let info = create_test_detector().detect_encoding(&oversized).unwrap();

    assert_eq!(info.charset, Charset::Utf8);
    assert!(info.confidence > 0.9);
}
556
/// ASCII text containing one well-formed two-byte UTF-8 sequence (C3 A9)
/// is detected as UTF-8.
#[test]
fn test_encoding_candidate_selection() {
    let mut mixed: Vec<u8> = Vec::new();
    mixed.extend_from_slice(b"English text ");
    mixed.extend_from_slice(&[0xC3, 0xA9]);
    mixed.extend_from_slice(b" and more text");

    let info = create_test_detector().detect_encoding(&mixed).unwrap();

    assert_eq!(info.charset, Charset::Utf8);
    assert!(info.confidence > 0.7);
}
573
/// Arbitrary bytes never make detection fail, and the reported confidence
/// stays within `0.0..=1.0`.
#[test]
fn test_unknown_encoding_fallback() {
    // Deterministic pseudo-random bytes; the `as u8` cast wraps on purpose.
    let mut scrambled: Vec<u8> = Vec::with_capacity(100);
    for i in 0..100 {
        scrambled.push((i * 7 + 13) as u8);
    }

    let info = create_test_detector().detect_encoding(&scrambled).unwrap();

    assert!((0.0..=1.0).contains(&info.confidence));
}
587
/// Detection on a 7000-byte ASCII buffer finishes within 100 ms.
#[test]
fn test_detection_performance() {
    let detector = create_test_detector();
    let text = "Hello, World! ".repeat(500);

    let started = std::time::Instant::now();
    let _ = detector.detect_encoding(text.as_bytes()).unwrap();
    let elapsed = started.elapsed();

    assert!(elapsed.as_millis() < 100);
}
603
/// When no candidate reaches the threshold, the configured default
/// encoding is reported, and changing the default changes the result.
#[test]
fn test_default_encoding_usage() {
    let undecidable: [u8; 6] = [0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    // The 0.95 threshold is strict enough to force the fallback here.
    let mut detector = EncodingDetector {
        default_encoding: String::from("gbk"),
        confidence_threshold: 0.95,
        supported_charsets: EncodingDetector::default_charsets(),
        max_sample_size: 8192,
    };

    let first = detector.detect_encoding(&undecidable).unwrap();
    assert_eq!(first.charset, Charset::Gbk);
    assert!(first.sample_text.contains("gbk") || first.sample_text.contains("default"));
    assert!(first.confidence < 0.95);

    detector.default_encoding = String::from("utf-16le");

    let second = detector.detect_encoding(&undecidable).unwrap();
    assert_eq!(second.charset, Charset::Utf16Le);
    assert!(second.sample_text.contains("utf-16le") || second.sample_text.contains("default"));
}
631
/// Known charset names resolve case-insensitively; unknown names fall
/// back to UTF-8.
#[test]
fn test_encoding_name_parsing() {
    let detector = create_test_detector();

    let expectations = [
        ("utf-8", Charset::Utf8),
        ("UTF8", Charset::Utf8),
        ("gbk", Charset::Gbk),
        ("GBK", Charset::Gbk),
        ("shift-jis", Charset::ShiftJis),
        ("SHIFT_JIS", Charset::ShiftJis),
        ("big5", Charset::Big5),
        ("iso-8859-1", Charset::Iso88591),
        ("windows-1252", Charset::Windows1252),
        // Unrecognised names use the UTF-8 fallback.
        ("unknown-encoding", Charset::Utf8),
    ];

    for (name, expected) in expectations.iter() {
        assert_eq!(&detector.parse_charset_name(name), expected);
    }
}
657
/// A detector built from `Config` picks up the configured default
/// encoding and confidence threshold.
#[test]
fn test_config_integration() {
    use crate::config::Config;

    let mut config = Config::default();
    config.formats.encoding_detection_confidence = 0.9;
    config.formats.default_encoding = "gbk".to_string();

    let detector = EncodingDetector::new(&config);
    assert_eq!(detector.default_encoding, "gbk");
    assert_eq!(detector.confidence_threshold, 0.9);

    // ASCII "Hello": the fallback is only required when the reported
    // confidence is below the configured threshold.
    let info = detector.detect_encoding(b"Hello").unwrap();
    if info.confidence < 0.9 {
        assert_eq!(info.charset, Charset::Gbk);
    }
}
683}