subx_cli/core/formats/encoding/
detector.rs

use crate::Result;
use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
use std::fs::File;
use std::io::Read;

/// Encoding detection engine
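///
/// Detection proceeds in three stages: a byte order mark check, then
/// byte-pattern heuristics for each supported charset, and finally a
/// fallback to the configured default encoding when no candidate reaches
/// the confidence threshold.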
pub struct EncodingDetector {
    confidence_threshold: f32,
    max_sample_size: usize,
    supported_charsets: Vec<Charset>,
    default_encoding: String,
}

impl EncodingDetector {
    /// Create encoding detector with configuration
    pub fn new(config: &crate::config::Config) -> Self {
        Self {
            confidence_threshold: config.formats.encoding_detection_confidence,
            max_sample_size: 8192,
            supported_charsets: Self::default_charsets(),
            default_encoding: config.formats.default_encoding.clone(),
        }
    }

    /// Create encoding detector with default configuration
    pub fn with_defaults() -> Self {
        Self {
            confidence_threshold: 0.8, // Default confidence threshold
            max_sample_size: 8192,
            supported_charsets: Self::default_charsets(),
            default_encoding: "utf-8".to_string(),
        }
    }

    /// Create encoding detector with custom configuration
    pub fn with_config(config: &crate::config::Config) -> Self {
        Self {
            confidence_threshold: config.formats.encoding_detection_confidence,
            max_sample_size: 8192,
            supported_charsets: Self::default_charsets(),
            default_encoding: config.formats.default_encoding.clone(),
        }
    }

    /// Detect file encoding
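    ///
    /// Reads at most `max_sample_size` bytes from the start of the file and
    /// runs detection on that sample. A minimal usage sketch (the file path
    /// below is a hypothetical example):
    ///
    /// ```ignore
    /// let detector = EncodingDetector::with_defaults();
    /// let info = detector.detect_file_encoding("subtitle.srt")?;
    /// println!("{:?} (confidence {:.2})", info.charset, info.confidence);
    /// ```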
    pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
        let mut file = File::open(file_path)?;
        let mut buffer = vec![0; self.max_sample_size];
        let bytes_read = file.read(&mut buffer)?;
        buffer.truncate(bytes_read);
        self.detect_encoding(&buffer)
    }

    /// Detect data encoding
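    ///
    /// Checks for a BOM first and falls back to byte-pattern analysis. A
    /// minimal usage sketch with the default configuration:
    ///
    /// ```ignore
    /// let detector = EncodingDetector::with_defaults();
    /// let info = detector.detect_encoding("Hello, 世界!".as_bytes())?;
    /// assert_eq!(info.charset, Charset::Utf8);
    /// ```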
    pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
        if let Some(encoding) = self.detect_bom(data) {
            return Ok(encoding);
        }
        let candidates = self.analyze_byte_patterns(data)?;
        self.select_best_encoding(candidates, data)
    }

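    /// Detect a Unicode byte order mark at the start of the data.
    ///
    /// The 4-byte UTF-32 BOMs are checked before the 2-byte UTF-16 BOMs
    /// because the UTF-32 LE BOM begins with the UTF-16 LE BOM.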
    fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
        // Check the 4-byte UTF-32 BOMs first; the UTF-32 LE BOM (FF FE 00 00)
        // starts with the UTF-16 LE BOM (FF FE) and would otherwise be
        // misidentified as UTF-16 LE.
        if data.len() >= 4 {
            match &data[0..4] {
                [0xFF, 0xFE, 0x00, 0x00] => {
                    return Some(EncodingInfo {
                        charset: Charset::Utf32Le,
                        confidence: 1.0,
                        bom_detected: true,
                        sample_text: String::from("UTF-32 LE with BOM"),
                    });
                }
                [0x00, 0x00, 0xFE, 0xFF] => {
                    return Some(EncodingInfo {
                        charset: Charset::Utf32Be,
                        confidence: 1.0,
                        bom_detected: true,
                        sample_text: String::from("UTF-32 BE with BOM"),
                    });
                }
                _ => {}
            }
        }
        if data.len() >= 3 && data[0..3] == [0xEF, 0xBB, 0xBF] {
            return Some(EncodingInfo {
                charset: Charset::Utf8,
                confidence: 1.0,
                bom_detected: true,
                sample_text: String::from("UTF-8 with BOM"),
            });
        }
        if data.len() < 2 {
            return None;
        }
        match &data[0..2] {
            [0xFF, 0xFE] => Some(EncodingInfo {
                charset: Charset::Utf16Le,
                confidence: 1.0,
                bom_detected: true,
                sample_text: String::from("UTF-16 LE with BOM"),
            }),
            [0xFE, 0xFF] => Some(EncodingInfo {
                charset: Charset::Utf16Be,
                confidence: 1.0,
                bom_detected: true,
                sample_text: String::from("UTF-16 BE with BOM"),
            }),
            _ => None,
        }
    }

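    /// Score each supported charset against the data and return candidates
    /// with confidence above 0.1, sorted by descending confidence.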
    fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
        let mut candidates = Vec::new();
        for charset in &self.supported_charsets {
            let confidence = self.calculate_encoding_confidence(data, charset)?;
            if confidence > 0.1 {
                candidates.push(EncodingCandidate {
                    charset: charset.clone(),
                    confidence,
                });
            }
        }
        candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
        Ok(candidates)
    }

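    /// Dispatch to the per-charset byte-pattern heuristic; charsets without
    /// a heuristic score 0.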
    fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
        match charset {
            Charset::Utf8 => self.check_utf8_validity(data),
            Charset::Gbk => self.check_gbk_patterns(data),
            Charset::ShiftJis => self.check_shift_jis_patterns(data),
            Charset::Big5 => self.check_big5_patterns(data),
            Charset::Iso88591 => self.check_iso88591_patterns(data),
            Charset::Windows1252 => self.check_windows1252_patterns(data),
            _ => Ok(0.0),
        }
    }

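    /// Ratio of well-formed UTF-8 sequences (lead byte plus the expected
    /// continuation bytes) to the total number of sequences seen.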
    fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
        let mut valid_chars = 0;
        let mut total_chars = 0;
        let mut i = 0;

        while i < data.len() {
            total_chars += 1;
            if data[i] & 0x80 == 0 {
                valid_chars += 1;
                i += 1;
            } else if data[i] & 0xE0 == 0xC0 {
                if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
                    valid_chars += 1;
                }
                i += 2;
            } else if data[i] & 0xF0 == 0xE0 {
                if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
                    valid_chars += 1;
                }
                i += 3;
            } else if data[i] & 0xF8 == 0xF0 {
                if i + 3 < data.len()
                    && data[i + 1] & 0xC0 == 0x80
                    && data[i + 2] & 0xC0 == 0x80
                    && data[i + 3] & 0xC0 == 0x80
                {
                    valid_chars += 1;
                }
                i += 4;
            } else {
                i += 1;
            }
        }

        Ok(if total_chars > 0 {
            valid_chars as f32 / total_chars as f32
        } else {
            0.0
        })
    }

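    /// Ratio of bytes/pairs that fit GBK: ASCII, or a lead byte in 0x81-0xFE
    /// followed by a trail byte in 0x40-0x7E or 0x80-0xFE.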
    fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
        let mut valid_chars = 0;
        let mut total_chars = 0;
        let mut i = 0;

        while i < data.len() {
            if data[i] < 0x80 {
                valid_chars += 1;
                total_chars += 1;
                i += 1;
            } else if i + 1 < data.len() {
                let byte1 = data[i];
                let byte2 = data[i + 1];
                if (0x81..=0xFE).contains(&byte1)
                    && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
                {
                    valid_chars += 1;
                }
                total_chars += 1;
                i += 2;
            } else {
                total_chars += 1;
                i += 1;
            }
        }

        Ok(if total_chars > 0 {
            valid_chars as f32 / total_chars as f32
        } else {
            0.0
        })
    }

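    /// Ratio of bytes/pairs that fit Shift-JIS: ASCII, or a lead byte in
    /// 0x81-0x9F or 0xE0-0xEF followed by a trail byte in 0x40-0xFC
    /// (excluding 0x7F).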
    fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
        let mut valid_chars = 0;
        let mut total_chars = 0;
        let mut i = 0;

        while i < data.len() {
            if data[i] < 0x80 {
                valid_chars += 1;
                total_chars += 1;
                i += 1;
            } else if i + 1 < data.len() {
                let byte1 = data[i];
                let byte2 = data[i + 1];
                if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
                    && (0x40..=0xFC).contains(&byte2)
                    && byte2 != 0x7F
                {
                    valid_chars += 1;
                }
                total_chars += 1;
                i += 2;
            } else {
                total_chars += 1;
                i += 1;
            }
        }

        Ok(if total_chars > 0 {
            valid_chars as f32 / total_chars as f32
        } else {
            0.0
        })
    }

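    /// Ratio of bytes/pairs that fit Big5: ASCII, or a lead byte in 0xA1-0xFE
    /// followed by a trail byte in 0x40-0x7E or 0xA1-0xFE.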
    fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
        let mut valid_chars = 0;
        let mut total_chars = 0;
        let mut i = 0;

        while i < data.len() {
            if data[i] < 0x80 {
                valid_chars += 1;
                total_chars += 1;
                i += 1;
            } else if i + 1 < data.len() {
                let byte1 = data[i];
                let byte2 = data[i + 1];
                if (0xA1..=0xFE).contains(&byte1)
                    && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
                {
                    valid_chars += 1;
                }
                total_chars += 1;
                i += 2;
            } else {
                total_chars += 1;
                i += 1;
            }
        }

        Ok(if total_chars > 0 {
            valid_chars as f32 / total_chars as f32
        } else {
            0.0
        })
    }

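    /// ISO-8859-1 accepts almost any byte sequence, so score it higher only
    /// when high bytes are present and the data is not valid UTF-8.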
    fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
        let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
        let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
        if extended_count > 0 {
            let utf8_conf = self.check_utf8_validity(data)?;
            Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
        } else {
            Ok(0.5)
        }
    }

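    /// Windows-1252 maps 0x80-0x9F to printable characters (unlike
    /// ISO-8859-1); score it higher when high bytes appear and the data is
    /// not valid UTF-8.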
    fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
        let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
        let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
        if control_chars > 0 || extended_chars > 0 {
            let utf8_conf = self.check_utf8_validity(data)?;
            Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
        } else {
            Ok(0.3)
        }
    }

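    /// Pick the highest-confidence candidate, or fall back to the configured
    /// default encoding when there are no candidates or the best one is below
    /// the confidence threshold.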
    fn select_best_encoding(
        &self,
        candidates: Vec<EncodingCandidate>,
        data: &[u8],
    ) -> Result<EncodingInfo> {
        if candidates.is_empty() {
            let default_charset = self.parse_charset_name(&self.default_encoding);
            let sample = self.decode_sample(data, &default_charset)?;
            return Ok(EncodingInfo {
                charset: default_charset,
                confidence: 0.1,
                bom_detected: false,
                sample_text: format!(
                    "Unable to detect encoding, using default: {} (sample: {})",
                    self.default_encoding,
                    sample.chars().take(50).collect::<String>()
                ),
            });
        }
        let best = &candidates[0];
        if best.confidence < self.confidence_threshold {
            let default_charset = self.parse_charset_name(&self.default_encoding);
            let sample = self.decode_sample(data, &default_charset)?;
            return Ok(EncodingInfo {
                charset: default_charset,
                confidence: 0.5,
                bom_detected: false,
                sample_text: format!(
                    "Low confidence detection, using default: {} (sample: {})",
                    self.default_encoding,
                    sample.chars().take(50).collect::<String>()
                ),
            });
        }
        let sample = self.decode_sample(data, &best.charset)?;
        Ok(EncodingInfo {
            charset: best.charset.clone(),
            confidence: best.confidence,
            bom_detected: false,
            sample_text: sample,
        })
    }

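    /// Decode up to 200 bytes of the data as a preview string. Only UTF-8 is
    /// decoded strictly (with a lossy fallback); other charsets currently use
    /// lossy UTF-8 decoding.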
    fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
        let sample_size = data.len().min(200);
        let sample_data = &data[0..sample_size];
        match charset {
            Charset::Utf8 => String::from_utf8(sample_data.to_vec())
                .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
            _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
        }
    }

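    /// Charsets considered during byte-pattern analysis; the UTF-16 and
    /// UTF-32 variants are detected only via their BOMs.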
    fn default_charsets() -> Vec<Charset> {
        vec![
            Charset::Utf8,
            Charset::Gbk,
            Charset::ShiftJis,
            Charset::Big5,
            Charset::Iso88591,
            Charset::Windows1252,
        ]
    }

    /// Convert encoding name string to Charset enum
    fn parse_charset_name(&self, encoding_name: &str) -> Charset {
        match encoding_name.to_lowercase().as_str() {
            "utf-8" | "utf8" => Charset::Utf8,
            "utf-16le" | "utf16le" => Charset::Utf16Le,
            "utf-16be" | "utf16be" => Charset::Utf16Be,
            "utf-32le" | "utf32le" => Charset::Utf32Le,
            "utf-32be" | "utf32be" => Charset::Utf32Be,
            "gbk" | "gb2312" => Charset::Gbk,
            "shift-jis" | "shift_jis" | "sjis" => Charset::ShiftJis,
            "iso-8859-1" | "iso88591" | "latin1" => Charset::Iso88591,
            "windows-1252" | "windows1252" | "cp1252" => Charset::Windows1252,
            "big5" => Charset::Big5,
            "euc-kr" | "euckr" => Charset::Euckr,
            _ => Charset::Utf8, // Default fallback
        }
    }
}

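/// A charset paired with the heuristic confidence assigned to it.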
#[derive(Debug, Clone)]
struct EncodingCandidate {
    charset: Charset,
    confidence: f32,
}

impl Default for EncodingDetector {
    fn default() -> Self {
        Self::with_defaults()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    fn create_test_detector() -> EncodingDetector {
        EncodingDetector {
            confidence_threshold: 0.7,
            max_sample_size: 8192,
            supported_charsets: EncodingDetector::default_charsets(),
            default_encoding: "utf-8".to_string(),
        }
    }

    /// Test UTF-8 encoding detection
    #[test]
    fn test_utf8_detection_accuracy() {
        let detector = create_test_detector();
        let utf8_text = "Hello, 世界! Bonjour, monde! 🌍";

        let result = detector.detect_encoding(utf8_text.as_bytes()).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.8);
        assert!(!result.bom_detected);
        assert!(result.sample_text.contains("Hello"));
    }

    /// Test UTF-8 BOM detection
    #[test]
    fn test_utf8_bom_detection() {
        let detector = create_test_detector();
        let mut bom_data = vec![0xEF, 0xBB, 0xBF]; // UTF-8 BOM
        bom_data.extend_from_slice("Hello, World!".as_bytes());

        let result = detector.detect_encoding(&bom_data).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert_eq!(result.confidence, 1.0);
        assert!(result.bom_detected);
        assert_eq!(result.sample_text, "UTF-8 with BOM");
    }

    /// Test UTF-16 BOM detection
    #[test]
    fn test_utf16_bom_detection() {
        let detector = create_test_detector();

        // UTF-16 LE BOM
        let utf16le_data = vec![0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00]; // "He" in UTF-16 LE
        let result = detector.detect_encoding(&utf16le_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Le);
        assert!(result.bom_detected);

        // UTF-16 BE BOM
        let utf16be_data = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65]; // "He" in UTF-16 BE
        let result = detector.detect_encoding(&utf16be_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Be);
        assert!(result.bom_detected);
    }

    /// Test file encoding detection
    #[test]
    fn test_file_encoding_detection() {
        let detector = create_test_detector();
        let temp_dir = TempDir::new().unwrap();

        // Create UTF-8 file
        let utf8_path = temp_dir.path().join("utf8.txt");
        fs::write(&utf8_path, "測試檔案編碼檢測功能。").unwrap();

        let result = detector
            .detect_file_encoding(utf8_path.to_str().unwrap())
            .unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// Test error handling for non-existent files
    #[test]
    fn test_nonexistent_file_error() {
        let detector = create_test_detector();
        let result = detector.detect_file_encoding("nonexistent.txt");

        assert!(result.is_err());
    }

    /// Test GBK encoding pattern detection
    #[test]
    fn test_gbk_pattern_detection() {
        let detector = create_test_detector();

        // Simulate GBK encoding pattern (high byte range)
        let gbk_pattern = vec![
            0xC4, 0xE3, 0xBA, 0xC3, // "Hello" in GBK encoding
            0xCA, 0xC0, 0xBD, 0xE7, // "World" in GBK encoding
        ];

        let result = detector.detect_encoding(&gbk_pattern).unwrap();

        // Should detect as GBK or at least not UTF-8
        assert!(result.confidence > 0.3);
        if result.charset == Charset::Gbk {
            assert!(result.confidence > 0.5);
        }
    }

    /// Test Shift-JIS encoding detection
    #[test]
    fn test_shift_jis_detection() {
        let detector = create_test_detector();

        // Simulate Shift-JIS encoding pattern
        let shift_jis_pattern = vec![
            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
            0x82, 0xC9, 0x82, 0xBF, // にち (Shift-JIS)
        ];

        let result = detector.detect_encoding(&shift_jis_pattern).unwrap();

        // Should detect as Shift-JIS or related encoding
        assert!(result.confidence > 0.2);
    }

    /// Test encoding confidence ranking
    #[test]
    fn test_encoding_confidence_ranking() {
        let detector = create_test_detector();

        // Clear UTF-8 text should have highest confidence
        let clear_utf8 = "Clear English text with numbers 123.";
        let utf8_result = detector.detect_encoding(clear_utf8.as_bytes()).unwrap();

        // Ambiguous data should have lower confidence
        let ambiguous_data: Vec<u8> = (0x80..=0xFF).cycle().take(50).collect();
        let ambiguous_result = detector.detect_encoding(&ambiguous_data).unwrap();

        assert!(utf8_result.confidence > ambiguous_result.confidence);
    }

    /// Test maximum sample size limit
    #[test]
    fn test_max_sample_size_limit() {
        let detector = create_test_detector();

        // Create data exceeding the 8192-byte sample size limit
        let large_data = vec![b'A'; 10000];
        let result = detector.detect_encoding(&large_data).unwrap();

        // Should successfully detect without failing due to data size
        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.9);
    }

    /// Test encoding candidate selection logic
    #[test]
    fn test_encoding_candidate_selection() {
        let detector = create_test_detector();

        // Create data with mixed encoding features
        let mut mixed_data = b"English text ".to_vec();
        mixed_data.extend_from_slice(&[0xC3, 0xA9]); // é in UTF-8
        mixed_data.extend_from_slice(b" and more text");

        let result = detector.detect_encoding(&mixed_data).unwrap();

        // Should correctly choose UTF-8
        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// Test fallback mechanism for unknown encodings
    #[test]
    fn test_unknown_encoding_fallback() {
        let detector = create_test_detector();

        // Create pseudo-random data
        let random_data: Vec<u8> = (0..100).map(|i| (i * 7 + 13) as u8).collect();
        let result = detector.detect_encoding(&random_data).unwrap();

        // Should have a fallback encoding choice
        assert!(result.confidence >= 0.0);
        assert!(result.confidence <= 1.0);
    }

    /// Test encoding detection performance
    #[test]
    fn test_detection_performance() {
        let detector = create_test_detector();

        // Create a medium-sized text sample
        let large_text = "Hello, World! ".repeat(500);

        let start = std::time::Instant::now();
        let _result = detector.detect_encoding(large_text.as_bytes()).unwrap();
        let duration = start.elapsed();

        // Detection should complete within reasonable time (< 100ms)
        assert!(duration.as_millis() < 100);
    }

    /// Test default encoding configuration usage
    #[test]
    fn test_default_encoding_usage() {
        // Create detector with GBK as default encoding
        let mut detector = EncodingDetector {
            confidence_threshold: 0.95, // Very high threshold to force default usage
            max_sample_size: 8192,
            supported_charsets: EncodingDetector::default_charsets(),
            default_encoding: "gbk".to_string(),
        };

        // Use truly ambiguous data that won't meet the very high confidence threshold:
        // mixed high-byte data that could be various encodings
        let ambiguous_data = vec![0x80, 0x81, 0x82, 0x83, 0x84, 0x85];
        let result = detector.detect_encoding(&ambiguous_data).unwrap();

        // Should fall back to configured default encoding (GBK)
        assert_eq!(result.charset, Charset::Gbk);
        assert!(result.sample_text.contains("gbk") || result.sample_text.contains("default"));
        assert!(result.confidence < 0.95); // Should be fallback confidence

        // Test with UTF-16 LE as default
        detector.default_encoding = "utf-16le".to_string();
        let result = detector.detect_encoding(&ambiguous_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Le);
        assert!(result.sample_text.contains("utf-16le") || result.sample_text.contains("default"));
    }

    /// Test encoding name parsing
    #[test]
    fn test_encoding_name_parsing() {
        let detector = create_test_detector();

        // Test various encoding name formats
        assert_eq!(detector.parse_charset_name("utf-8"), Charset::Utf8);
        assert_eq!(detector.parse_charset_name("UTF8"), Charset::Utf8);
        assert_eq!(detector.parse_charset_name("gbk"), Charset::Gbk);
        assert_eq!(detector.parse_charset_name("GBK"), Charset::Gbk);
        assert_eq!(detector.parse_charset_name("shift-jis"), Charset::ShiftJis);
        assert_eq!(detector.parse_charset_name("SHIFT_JIS"), Charset::ShiftJis);
        assert_eq!(detector.parse_charset_name("big5"), Charset::Big5);
        assert_eq!(detector.parse_charset_name("iso-8859-1"), Charset::Iso88591);
        assert_eq!(
            detector.parse_charset_name("windows-1252"),
            Charset::Windows1252
        );

        // Test unknown encoding fallback
        assert_eq!(
            detector.parse_charset_name("unknown-encoding"),
            Charset::Utf8
        );
    }

    /// Test configuration integration
    #[test]
    fn test_config_integration() {
        use crate::config::Config;

        // Create config with custom default encoding
        let mut config = Config::default();
        config.formats.default_encoding = "gbk".to_string();
        config.formats.encoding_detection_confidence = 0.9;

        let detector = EncodingDetector::new(&config);

        // Verify configuration was applied
        assert_eq!(detector.default_encoding, "gbk");
        assert_eq!(detector.confidence_threshold, 0.9);

        // "Hello" is plain ASCII and is normally detected with high confidence
        let ambiguous_data = vec![0x48, 0x65, 0x6C, 0x6C, 0x6F]; // "Hello"
        let result = detector.detect_encoding(&ambiguous_data).unwrap();

        // Should use GBK as default only when confidence is low
        if result.confidence < 0.9 {
            assert_eq!(result.charset, Charset::Gbk);
        }
    }
}