Skip to main content

subx_cli/core/formats/encoding/
detector.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use std::fs::File;
4use std::io::Read;
5
6/// Encoding detection engine
7pub struct EncodingDetector {
8    confidence_threshold: f32,
9    max_sample_size: usize,
10    supported_charsets: Vec<Charset>,
11    default_encoding: String,
12}
13
14impl EncodingDetector {
15    /// Create encoding detector with configuration
16    pub fn new(config: &crate::config::Config) -> Self {
17        Self {
18            confidence_threshold: config.formats.encoding_detection_confidence,
19            max_sample_size: 8192,
20            supported_charsets: Self::default_charsets(),
21            default_encoding: config.formats.default_encoding.clone(),
22        }
23    }
24
25    /// Create encoding detector with default configuration
26    pub fn with_defaults() -> Self {
27        Self {
28            confidence_threshold: 0.8, // Default confidence threshold
29            max_sample_size: 8192,
30            supported_charsets: Self::default_charsets(),
31            default_encoding: "utf-8".to_string(),
32        }
33    }
34
35    /// Create encoding detector with custom configuration
36    pub fn with_config(config: &crate::config::Config) -> Self {
37        Self {
38            confidence_threshold: config.formats.encoding_detection_confidence,
39            max_sample_size: 8192,
40            supported_charsets: Self::default_charsets(),
41            default_encoding: config.formats.default_encoding.clone(),
42        }
43    }
44
45    /// Detect file encoding
46    pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
47        crate::core::fs_util::check_file_size(
48            std::path::Path::new(file_path),
49            52_428_800,
50            "Subtitle",
51        )?;
52        let mut file = File::open(file_path)?;
53        let mut buffer = vec![0; self.max_sample_size];
54        let bytes_read = file.read(&mut buffer)?;
55        buffer.truncate(bytes_read);
56        self.detect_encoding(&buffer)
57    }
58
59    /// Detect data encoding
60    pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
61        if let Some(encoding) = self.detect_bom(data) {
62            return Ok(encoding);
63        }
64        let candidates = self.analyze_byte_patterns(data)?;
65        self.select_best_encoding(candidates, data)
66    }
67
68    fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
69        if data.len() < 3 {
70            return None;
71        }
72        match &data[0..3] {
73            [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
74                charset: Charset::Utf8,
75                confidence: 1.0,
76                bom_detected: true,
77                sample_text: String::from("UTF-8 with BOM"),
78            }),
79            [0xFF, 0xFE, ..] => Some(EncodingInfo {
80                charset: Charset::Utf16Le,
81                confidence: 1.0,
82                bom_detected: true,
83                sample_text: String::from("UTF-16 LE with BOM"),
84            }),
85            [0xFE, 0xFF, ..] => Some(EncodingInfo {
86                charset: Charset::Utf16Be,
87                confidence: 1.0,
88                bom_detected: true,
89                sample_text: String::from("UTF-16 BE with BOM"),
90            }),
91            _ => {
92                if data.len() >= 4 {
93                    match &data[0..4] {
94                        [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
95                            charset: Charset::Utf32Le,
96                            confidence: 1.0,
97                            bom_detected: true,
98                            sample_text: String::from("UTF-32 LE with BOM"),
99                        }),
100                        [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
101                            charset: Charset::Utf32Be,
102                            confidence: 1.0,
103                            bom_detected: true,
104                            sample_text: String::from("UTF-32 BE with BOM"),
105                        }),
106                        _ => None,
107                    }
108                } else {
109                    None
110                }
111            }
112        }
113    }
114
115    fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
116        let mut candidates = Vec::new();
117        for charset in &self.supported_charsets {
118            let confidence = self.calculate_encoding_confidence(data, charset)?;
119            if confidence > 0.1 {
120                candidates.push(EncodingCandidate {
121                    charset: charset.clone(),
122                    confidence,
123                });
124            }
125        }
126        candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
127        Ok(candidates)
128    }
129
130    fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
131        match charset {
132            Charset::Utf8 => self.check_utf8_validity(data),
133            Charset::Gbk => self.check_gbk_patterns(data),
134            Charset::ShiftJis => self.check_shift_jis_patterns(data),
135            Charset::Big5 => self.check_big5_patterns(data),
136            Charset::Iso88591 => self.check_iso88591_patterns(data),
137            Charset::Windows1252 => self.check_windows1252_patterns(data),
138            _ => Ok(0.0),
139        }
140    }
141
142    fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
143        let mut valid_chars = 0;
144        let mut total_chars = 0;
145        let mut i = 0;
146
147        while i < data.len() {
148            total_chars += 1;
149            if data[i] & 0x80 == 0 {
150                valid_chars += 1;
151                i += 1;
152            } else if data[i] & 0xE0 == 0xC0 {
153                if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
154                    valid_chars += 1;
155                }
156                i += 2;
157            } else if data[i] & 0xF0 == 0xE0 {
158                if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
159                    valid_chars += 1;
160                }
161                i += 3;
162            } else if data[i] & 0xF8 == 0xF0 {
163                if i + 3 < data.len()
164                    && data[i + 1] & 0xC0 == 0x80
165                    && data[i + 2] & 0xC0 == 0x80
166                    && data[i + 3] & 0xC0 == 0x80
167                {
168                    valid_chars += 1;
169                }
170                i += 4;
171            } else {
172                i += 1;
173            }
174        }
175
176        Ok(if total_chars > 0 {
177            valid_chars as f32 / total_chars as f32
178        } else {
179            0.0
180        })
181    }
182
183    fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
184        let mut valid_chars = 0;
185        let mut total_chars = 0;
186        let mut i = 0;
187
188        while i < data.len() {
189            if data[i] < 0x80 {
190                valid_chars += 1;
191                total_chars += 1;
192                i += 1;
193            } else if i + 1 < data.len() {
194                let byte1 = data[i];
195                let byte2 = data[i + 1];
196                if (0x81..=0xFE).contains(&byte1)
197                    && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
198                {
199                    valid_chars += 1;
200                }
201                total_chars += 1;
202                i += 2;
203            } else {
204                total_chars += 1;
205                i += 1;
206            }
207        }
208
209        Ok(if total_chars > 0 {
210            valid_chars as f32 / total_chars as f32
211        } else {
212            0.0
213        })
214    }
215
216    fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
217        let mut valid_chars = 0;
218        let mut total_chars = 0;
219        let mut i = 0;
220
221        while i < data.len() {
222            if data[i] < 0x80 {
223                valid_chars += 1;
224                total_chars += 1;
225                i += 1;
226            } else if i + 1 < data.len() {
227                let byte1 = data[i];
228                let byte2 = data[i + 1];
229                if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
230                    && (0x40..=0xFC).contains(&byte2)
231                    && byte2 != 0x7F
232                {
233                    valid_chars += 1;
234                }
235                total_chars += 1;
236                i += 2;
237            } else {
238                total_chars += 1;
239                i += 1;
240            }
241        }
242
243        Ok(if total_chars > 0 {
244            valid_chars as f32 / total_chars as f32
245        } else {
246            0.0
247        })
248    }
249
250    fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
251        let mut valid_chars = 0;
252        let mut total_chars = 0;
253        let mut i = 0;
254
255        while i < data.len() {
256            if data[i] < 0x80 {
257                valid_chars += 1;
258                total_chars += 1;
259                i += 1;
260            } else if i + 1 < data.len() {
261                let byte1 = data[i];
262                let byte2 = data[i + 1];
263                if (0xA1..=0xFE).contains(&byte1)
264                    && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
265                {
266                    valid_chars += 1;
267                }
268                total_chars += 1;
269                i += 2;
270            } else {
271                total_chars += 1;
272                i += 1;
273            }
274        }
275
276        Ok(if total_chars > 0 {
277            valid_chars as f32 / total_chars as f32
278        } else {
279            0.0
280        })
281    }
282
283    fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
284        let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
285        let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
286        if extended_count > 0 {
287            let utf8_conf = self.check_utf8_validity(data)?;
288            Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
289        } else {
290            Ok(0.5)
291        }
292    }
293
294    fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
295        let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
296        let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
297        if control_chars > 0 || extended_chars > 0 {
298            let utf8_conf = self.check_utf8_validity(data)?;
299            Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
300        } else {
301            Ok(0.3)
302        }
303    }
304
305    fn select_best_encoding(
306        &self,
307        candidates: Vec<EncodingCandidate>,
308        data: &[u8],
309    ) -> Result<EncodingInfo> {
310        if candidates.is_empty() {
311            let default_charset = self.parse_charset_name(&self.default_encoding);
312            let sample = self.decode_sample(data, &default_charset)?;
313            return Ok(EncodingInfo {
314                charset: default_charset,
315                confidence: 0.1,
316                bom_detected: false,
317                sample_text: format!(
318                    "Unable to detect encoding, using default: {} (sample: {})",
319                    self.default_encoding,
320                    sample.chars().take(50).collect::<String>()
321                ),
322            });
323        }
324        let best = &candidates[0];
325        if best.confidence < self.confidence_threshold {
326            let default_charset = self.parse_charset_name(&self.default_encoding);
327            let sample = self.decode_sample(data, &default_charset)?;
328            return Ok(EncodingInfo {
329                charset: default_charset,
330                confidence: 0.5,
331                bom_detected: false,
332                sample_text: format!(
333                    "Low confidence detection, using default: {} (sample: {})",
334                    self.default_encoding,
335                    sample.chars().take(50).collect::<String>()
336                ),
337            });
338        }
339        let sample = self.decode_sample(data, &best.charset)?;
340        Ok(EncodingInfo {
341            charset: best.charset.clone(),
342            confidence: best.confidence,
343            bom_detected: false,
344            sample_text: sample,
345        })
346    }
347
348    fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
349        let sample_size = data.len().min(200);
350        let sample_data = &data[0..sample_size];
351        match charset {
352            Charset::Utf8 => String::from_utf8(sample_data.to_vec())
353                .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
354            _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
355        }
356    }
357
358    fn default_charsets() -> Vec<Charset> {
359        vec![
360            Charset::Utf8,
361            Charset::Gbk,
362            Charset::ShiftJis,
363            Charset::Big5,
364            Charset::Iso88591,
365            Charset::Windows1252,
366        ]
367    }
368
369    /// Convert encoding name string to Charset enum
370    fn parse_charset_name(&self, encoding_name: &str) -> Charset {
371        match encoding_name.to_lowercase().as_str() {
372            "utf-8" | "utf8" => Charset::Utf8,
373            "utf-16le" | "utf16le" => Charset::Utf16Le,
374            "utf-16be" | "utf16be" => Charset::Utf16Be,
375            "utf-32le" | "utf32le" => Charset::Utf32Le,
376            "utf-32be" | "utf32be" => Charset::Utf32Be,
377            "gbk" | "gb2312" => Charset::Gbk,
378            "shift-jis" | "shift_jis" | "sjis" => Charset::ShiftJis,
379            "iso-8859-1" | "iso88591" | "latin1" => Charset::Iso88591,
380            "windows-1252" | "windows1252" | "cp1252" => Charset::Windows1252,
381            "big5" => Charset::Big5,
382            "euc-kr" | "euckr" => Charset::Euckr,
383            _ => Charset::Utf8, // Default fallback
384        }
385    }
386}
387
388#[derive(Debug, Clone)]
389struct EncodingCandidate {
390    charset: Charset,
391    confidence: f32,
392}
393
394impl Default for EncodingDetector {
395    fn default() -> Self {
396        Self::with_defaults()
397    }
398}
399
400#[cfg(test)]
401mod tests {
402    use super::*;
403    use std::fs;
404    use tempfile::TempDir;
405
406    fn create_test_detector() -> EncodingDetector {
407        EncodingDetector {
408            confidence_threshold: 0.7,
409            max_sample_size: 8192,
410            supported_charsets: EncodingDetector::default_charsets(),
411            default_encoding: "utf-8".to_string(),
412        }
413    }
414
415    /// Test UTF-8 encoding detection
416    #[test]
417    fn test_utf8_detection_accuracy() {
418        let detector = create_test_detector();
419        let utf8_text = "Hello, 世界! Bonjour, monde! 🌍";
420
421        let result = detector.detect_encoding(utf8_text.as_bytes()).unwrap();
422
423        assert_eq!(result.charset, Charset::Utf8);
424        assert!(result.confidence > 0.8);
425        assert!(!result.bom_detected);
426        assert!(result.sample_text.contains("Hello"));
427    }
428
429    /// Test UTF-8 BOM detection
430    #[test]
431    fn test_utf8_bom_detection() {
432        let detector = create_test_detector();
433        let mut bom_data = vec![0xEF, 0xBB, 0xBF]; // UTF-8 BOM
434        bom_data.extend_from_slice("Hello, World!".as_bytes());
435
436        let result = detector.detect_encoding(&bom_data).unwrap();
437
438        assert_eq!(result.charset, Charset::Utf8);
439        assert_eq!(result.confidence, 1.0);
440        assert!(result.bom_detected);
441        assert_eq!(result.sample_text, "UTF-8 with BOM");
442    }
443
444    /// Test UTF-16 BOM detection
445    #[test]
446    fn test_utf16_bom_detection() {
447        let detector = create_test_detector();
448
449        // UTF-16 LE BOM
450        let utf16le_data = vec![0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00]; // "He" in UTF-16 LE
451        let result = detector.detect_encoding(&utf16le_data).unwrap();
452        assert_eq!(result.charset, Charset::Utf16Le);
453        assert!(result.bom_detected);
454
455        // UTF-16 BE BOM
456        let utf16be_data = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65]; // "He" in UTF-16 BE
457        let result = detector.detect_encoding(&utf16be_data).unwrap();
458        assert_eq!(result.charset, Charset::Utf16Be);
459        assert!(result.bom_detected);
460    }
461
462    /// Test file encoding detection
463    #[test]
464    fn test_file_encoding_detection() {
465        let detector = create_test_detector();
466        let temp_dir = TempDir::new().unwrap();
467
468        // Create UTF-8 file
469        let utf8_path = temp_dir.path().join("utf8.txt");
470        fs::write(&utf8_path, "測試檔案編碼檢測功能。").unwrap();
471
472        let result = detector
473            .detect_file_encoding(utf8_path.to_str().unwrap())
474            .unwrap();
475
476        assert_eq!(result.charset, Charset::Utf8);
477        assert!(result.confidence > 0.7);
478    }
479
480    /// Test error handling for non-existent files
481    #[test]
482    fn test_nonexistent_file_error() {
483        let detector = create_test_detector();
484        let result = detector.detect_file_encoding("nonexistent.txt");
485
486        assert!(result.is_err());
487    }
488
489    /// Test GBK encoding pattern detection
490    #[test]
491    fn test_gbk_pattern_detection() {
492        let detector = create_test_detector();
493
494        // Simulate GBK encoding pattern (high byte range)
495        let gbk_pattern = vec![
496            0xC4, 0xE3, 0xBA, 0xC3, // "Hello" in GBK encoding
497            0xCA, 0xC0, 0xBD, 0xE7, // "World" in GBK encoding
498        ];
499
500        let result = detector.detect_encoding(&gbk_pattern).unwrap();
501
502        // Should detect as GBK or at least not UTF-8
503        assert!(result.confidence > 0.3);
504        if result.charset == Charset::Gbk {
505            assert!(result.confidence > 0.5);
506        }
507    }
508
509    /// Test Shift-JIS encoding detection
510    #[test]
511    fn test_shift_jis_detection() {
512        let detector = create_test_detector();
513
514        // Simulate Shift-JIS encoding pattern
515        let shift_jis_pattern = vec![
516            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
517            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
518            0x82, 0xC9, 0x82, 0xBF, // にち (Shift-JIS)
519        ];
520
521        let result = detector.detect_encoding(&shift_jis_pattern).unwrap();
522
523        // Should detect as Shift-JIS or related encoding
524        assert!(result.confidence > 0.2);
525    }
526
527    /// Test encoding confidence ranking
528    #[test]
529    fn test_encoding_confidence_ranking() {
530        let detector = create_test_detector();
531
532        // Clear UTF-8 text should have highest confidence
533        let clear_utf8 = "Clear English text with numbers 123.";
534        let utf8_result = detector.detect_encoding(clear_utf8.as_bytes()).unwrap();
535
536        // Ambiguous data should have lower confidence
537        let ambiguous_data: Vec<u8> = (0x80..=0xFF).cycle().take(50).collect();
538        let ambiguous_result = detector.detect_encoding(&ambiguous_data).unwrap();
539
540        assert!(utf8_result.confidence > ambiguous_result.confidence);
541    }
542
543    /// Test maximum sample size limit
544    #[test]
545    fn test_max_sample_size_limit() {
546        let detector = create_test_detector();
547
548        // Create data exceeding sample size limit
549        let large_data = vec![b'A'; 10000]; // Assuming limit is 8192
550        let result = detector.detect_encoding(&large_data).unwrap();
551
552        // Should successfully detect without failing due to data size
553        assert_eq!(result.charset, Charset::Utf8);
554        assert!(result.confidence > 0.9);
555    }
556
557    /// Test encoding candidate selection logic
558    #[test]
559    fn test_encoding_candidate_selection() {
560        let detector = create_test_detector();
561
562        // Create data with mixed encoding features
563        let mut mixed_data = b"English text ".to_vec();
564        mixed_data.extend_from_slice(&[0xC3, 0xA9]); // é in UTF-8
565        mixed_data.extend_from_slice(b" and more text");
566
567        let result = detector.detect_encoding(&mixed_data).unwrap();
568
569        // Should correctly choose UTF-8
570        assert_eq!(result.charset, Charset::Utf8);
571        assert!(result.confidence > 0.7);
572    }
573
574    /// Test fallback mechanism for unknown encodings
575    #[test]
576    fn test_unknown_encoding_fallback() {
577        let detector = create_test_detector();
578
579        // Create completely random data
580        let random_data: Vec<u8> = (0..100).map(|i| (i * 7 + 13) as u8).collect();
581        let result = detector.detect_encoding(&random_data).unwrap();
582
583        // Should have a fallback encoding choice
584        assert!(result.confidence >= 0.0);
585        assert!(result.confidence <= 1.0);
586    }
587
588    /// Test encoding detection performance
589    #[test]
590    fn test_detection_performance() {
591        let detector = create_test_detector();
592
593        // Create medium-sized text file
594        let large_text = "Hello, World! ".repeat(500);
595
596        let start = std::time::Instant::now();
597        let _result = detector.detect_encoding(large_text.as_bytes()).unwrap();
598        let duration = start.elapsed();
599
600        // Detection should complete within reasonable time (< 100ms)
601        assert!(duration.as_millis() < 100);
602    }
603
604    /// Test default encoding configuration usage
605    #[test]
606    fn test_default_encoding_usage() {
607        // Create detector with GBK as default encoding
608        let mut detector = EncodingDetector {
609            confidence_threshold: 0.95, // Very high threshold to force default usage
610            max_sample_size: 8192,
611            supported_charsets: EncodingDetector::default_charsets(),
612            default_encoding: "gbk".to_string(),
613        };
614
615        // Use truly ambiguous data that won't meet very high confidence threshold
616        // Mixed high-byte data that could be various encodings
617        let ambiguous_data = vec![0x80, 0x81, 0x82, 0x83, 0x84, 0x85];
618        let result = detector.detect_encoding(&ambiguous_data).unwrap();
619
620        // Should fall back to configured default encoding (GBK)
621        assert_eq!(result.charset, Charset::Gbk);
622        assert!(result.sample_text.contains("gbk") || result.sample_text.contains("default"));
623        assert!(result.confidence < 0.95); // Should be fallback confidence
624
625        // Test with UTF-16LE as default
626        detector.default_encoding = "utf-16le".to_string();
627        let result = detector.detect_encoding(&ambiguous_data).unwrap();
628        assert_eq!(result.charset, Charset::Utf16Le);
629        assert!(result.sample_text.contains("utf-16le") || result.sample_text.contains("default"));
630    }
631
632    /// Test encoding name parsing
633    #[test]
634    fn test_encoding_name_parsing() {
635        let detector = create_test_detector();
636
637        // Test various encoding name formats
638        assert_eq!(detector.parse_charset_name("utf-8"), Charset::Utf8);
639        assert_eq!(detector.parse_charset_name("UTF8"), Charset::Utf8);
640        assert_eq!(detector.parse_charset_name("gbk"), Charset::Gbk);
641        assert_eq!(detector.parse_charset_name("GBK"), Charset::Gbk);
642        assert_eq!(detector.parse_charset_name("shift-jis"), Charset::ShiftJis);
643        assert_eq!(detector.parse_charset_name("SHIFT_JIS"), Charset::ShiftJis);
644        assert_eq!(detector.parse_charset_name("big5"), Charset::Big5);
645        assert_eq!(detector.parse_charset_name("iso-8859-1"), Charset::Iso88591);
646        assert_eq!(
647            detector.parse_charset_name("windows-1252"),
648            Charset::Windows1252
649        );
650
651        // Test unknown encoding fallback
652        assert_eq!(
653            detector.parse_charset_name("unknown-encoding"),
654            Charset::Utf8
655        );
656    }
657
658    /// Test configuration integration
659    #[test]
660    fn test_config_integration() {
661        use crate::config::Config;
662
663        // Create config with custom default encoding
664        let mut config = Config::default();
665        config.formats.default_encoding = "gbk".to_string();
666        config.formats.encoding_detection_confidence = 0.9;
667
668        let detector = EncodingDetector::new(&config);
669
670        // Verify configuration was applied
671        assert_eq!(detector.default_encoding, "gbk");
672        assert_eq!(detector.confidence_threshold, 0.9);
673
674        // Test with low-confidence data
675        let ambiguous_data = vec![0x48, 0x65, 0x6C, 0x6C, 0x6F]; // "Hello"
676        let result = detector.detect_encoding(&ambiguous_data).unwrap();
677
678        // Should use GBK as default when confidence is low
679        if result.confidence < 0.9 {
680            assert_eq!(result.charset, Charset::Gbk);
681        }
682    }
683}