subx_cli/core/formats/encoding/detector.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use std::fs::File;
4use std::io::Read;
5
/// Encoding detection engine
///
/// Detects text encodings via BOM inspection and per-charset
/// byte-pattern heuristics.
pub struct EncodingDetector {
    // Minimum heuristic confidence required before a match is accepted;
    // below this the detector falls back to UTF-8.
    confidence_threshold: f32,
    // Maximum number of bytes sampled from a file during detection.
    max_sample_size: usize,
    // Charsets that are scored during byte-pattern analysis.
    supported_charsets: Vec<Charset>,
}
12
13impl EncodingDetector {
14    /// Create encoding detector with configuration
15    pub fn new(config: &crate::config::Config) -> Self {
16        Self {
17            confidence_threshold: config.formats.encoding_detection_confidence,
18            max_sample_size: 8192,
19            supported_charsets: Self::default_charsets(),
20        }
21    }
22
23    /// Create encoding detector with default configuration
24    pub fn with_defaults() -> Self {
25        Self {
26            confidence_threshold: 0.8, // Default confidence threshold
27            max_sample_size: 8192,
28            supported_charsets: Self::default_charsets(),
29        }
30    }
31
32    /// Create encoding detector with custom configuration
33    pub fn with_config(config: &crate::config::Config) -> Self {
34        Self {
35            confidence_threshold: config.formats.encoding_detection_confidence,
36            max_sample_size: 8192,
37            supported_charsets: Self::default_charsets(),
38        }
39    }
40
41    /// Detect file encoding
42    pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
43        let mut file = File::open(file_path)?;
44        let mut buffer = vec![0; self.max_sample_size];
45        let bytes_read = file.read(&mut buffer)?;
46        buffer.truncate(bytes_read);
47        self.detect_encoding(&buffer)
48    }
49
50    /// Detect data encoding
51    pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
52        if let Some(encoding) = self.detect_bom(data) {
53            return Ok(encoding);
54        }
55        let candidates = self.analyze_byte_patterns(data)?;
56        self.select_best_encoding(candidates, data)
57    }
58
59    fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
60        if data.len() < 3 {
61            return None;
62        }
63        match &data[0..3] {
64            [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
65                charset: Charset::Utf8,
66                confidence: 1.0,
67                bom_detected: true,
68                sample_text: String::from("UTF-8 with BOM"),
69            }),
70            [0xFF, 0xFE, ..] => Some(EncodingInfo {
71                charset: Charset::Utf16Le,
72                confidence: 1.0,
73                bom_detected: true,
74                sample_text: String::from("UTF-16 LE with BOM"),
75            }),
76            [0xFE, 0xFF, ..] => Some(EncodingInfo {
77                charset: Charset::Utf16Be,
78                confidence: 1.0,
79                bom_detected: true,
80                sample_text: String::from("UTF-16 BE with BOM"),
81            }),
82            _ => {
83                if data.len() >= 4 {
84                    match &data[0..4] {
85                        [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
86                            charset: Charset::Utf32Le,
87                            confidence: 1.0,
88                            bom_detected: true,
89                            sample_text: String::from("UTF-32 LE with BOM"),
90                        }),
91                        [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
92                            charset: Charset::Utf32Be,
93                            confidence: 1.0,
94                            bom_detected: true,
95                            sample_text: String::from("UTF-32 BE with BOM"),
96                        }),
97                        _ => None,
98                    }
99                } else {
100                    None
101                }
102            }
103        }
104    }
105
106    fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
107        let mut candidates = Vec::new();
108        for charset in &self.supported_charsets {
109            let confidence = self.calculate_encoding_confidence(data, charset)?;
110            if confidence > 0.1 {
111                candidates.push(EncodingCandidate {
112                    charset: charset.clone(),
113                    confidence,
114                });
115            }
116        }
117        candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
118        Ok(candidates)
119    }
120
121    fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
122        match charset {
123            Charset::Utf8 => self.check_utf8_validity(data),
124            Charset::Gbk => self.check_gbk_patterns(data),
125            Charset::ShiftJis => self.check_shift_jis_patterns(data),
126            Charset::Big5 => self.check_big5_patterns(data),
127            Charset::Iso88591 => self.check_iso88591_patterns(data),
128            Charset::Windows1252 => self.check_windows1252_patterns(data),
129            _ => Ok(0.0),
130        }
131    }
132
    /// Heuristic UTF-8 scorer: the fraction of scanned units that are
    /// well-formed UTF-8 sequences.
    ///
    /// Walks the buffer by the length implied by each lead byte (1–4
    /// bytes), counting a unit as valid only when every continuation byte
    /// has the `10xxxxxx` form. NOTE(review): an invalid sequence still
    /// advances by the implied length, so bytes after a bad lead are
    /// skipped rather than re-synchronized — acceptable for a confidence
    /// heuristic, not for strict validation.
    fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
        let mut valid_chars = 0;
        let mut total_chars = 0;
        let mut i = 0;

        while i < data.len() {
            total_chars += 1;
            if data[i] & 0x80 == 0 {
                // 0xxxxxxx: single-byte ASCII unit.
                valid_chars += 1;
                i += 1;
            } else if data[i] & 0xE0 == 0xC0 {
                // 110xxxxx: lead byte of a 2-byte sequence.
                if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
                    valid_chars += 1;
                }
                i += 2;
            } else if data[i] & 0xF0 == 0xE0 {
                // 1110xxxx: lead byte of a 3-byte sequence.
                if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
                    valid_chars += 1;
                }
                i += 3;
            } else if data[i] & 0xF8 == 0xF0 {
                // 11110xxx: lead byte of a 4-byte sequence.
                if i + 3 < data.len()
                    && data[i + 1] & 0xC0 == 0x80
                    && data[i + 2] & 0xC0 == 0x80
                    && data[i + 3] & 0xC0 == 0x80
                {
                    valid_chars += 1;
                }
                i += 4;
            } else {
                // Stray continuation byte or invalid lead; count it, move on.
                i += 1;
            }
        }

        // Empty input scores 0.0 rather than dividing by zero.
        Ok(if total_chars > 0 {
            valid_chars as f32 / total_chars as f32
        } else {
            0.0
        })
    }
173
174    fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
175        let mut valid_chars = 0;
176        let mut total_chars = 0;
177        let mut i = 0;
178
179        while i < data.len() {
180            if data[i] < 0x80 {
181                valid_chars += 1;
182                total_chars += 1;
183                i += 1;
184            } else if i + 1 < data.len() {
185                let byte1 = data[i];
186                let byte2 = data[i + 1];
187                if (0x81..=0xFE).contains(&byte1)
188                    && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
189                {
190                    valid_chars += 1;
191                }
192                total_chars += 1;
193                i += 2;
194            } else {
195                total_chars += 1;
196                i += 1;
197            }
198        }
199
200        Ok(if total_chars > 0 {
201            valid_chars as f32 / total_chars as f32
202        } else {
203            0.0
204        })
205    }
206
207    fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
208        let mut valid_chars = 0;
209        let mut total_chars = 0;
210        let mut i = 0;
211
212        while i < data.len() {
213            if data[i] < 0x80 {
214                valid_chars += 1;
215                total_chars += 1;
216                i += 1;
217            } else if i + 1 < data.len() {
218                let byte1 = data[i];
219                let byte2 = data[i + 1];
220                if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
221                    && (0x40..=0xFC).contains(&byte2)
222                    && byte2 != 0x7F
223                {
224                    valid_chars += 1;
225                }
226                total_chars += 1;
227                i += 2;
228            } else {
229                total_chars += 1;
230                i += 1;
231            }
232        }
233
234        Ok(if total_chars > 0 {
235            valid_chars as f32 / total_chars as f32
236        } else {
237            0.0
238        })
239    }
240
241    fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
242        let mut valid_chars = 0;
243        let mut total_chars = 0;
244        let mut i = 0;
245
246        while i < data.len() {
247            if data[i] < 0x80 {
248                valid_chars += 1;
249                total_chars += 1;
250                i += 1;
251            } else if i + 1 < data.len() {
252                let byte1 = data[i];
253                let byte2 = data[i + 1];
254                if (0xA1..=0xFE).contains(&byte1)
255                    && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
256                {
257                    valid_chars += 1;
258                }
259                total_chars += 1;
260                i += 2;
261            } else {
262                total_chars += 1;
263                i += 1;
264            }
265        }
266
267        Ok(if total_chars > 0 {
268            valid_chars as f32 / total_chars as f32
269        } else {
270            0.0
271        })
272    }
273
274    fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
275        let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
276        let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
277        if extended_count > 0 {
278            let utf8_conf = self.check_utf8_validity(data)?;
279            Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
280        } else {
281            Ok(0.5)
282        }
283    }
284
285    fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
286        let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
287        let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
288        if control_chars > 0 || extended_chars > 0 {
289            let utf8_conf = self.check_utf8_validity(data)?;
290            Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
291        } else {
292            Ok(0.3)
293        }
294    }
295
296    fn select_best_encoding(
297        &self,
298        candidates: Vec<EncodingCandidate>,
299        data: &[u8],
300    ) -> Result<EncodingInfo> {
301        if candidates.is_empty() {
302            return Ok(EncodingInfo {
303                charset: Charset::Unknown,
304                confidence: 0.0,
305                bom_detected: false,
306                sample_text: String::from("Unable to detect encoding"),
307            });
308        }
309        let best = &candidates[0];
310        if best.confidence < self.confidence_threshold {
311            return Ok(EncodingInfo {
312                charset: Charset::Utf8,
313                confidence: 0.5,
314                bom_detected: false,
315                sample_text: "Using default encoding: UTF-8".to_string(),
316            });
317        }
318        let sample = self.decode_sample(data, &best.charset)?;
319        Ok(EncodingInfo {
320            charset: best.charset.clone(),
321            confidence: best.confidence,
322            bom_detected: false,
323            sample_text: sample,
324        })
325    }
326
327    fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
328        let sample_size = data.len().min(200);
329        let sample_data = &data[0..sample_size];
330        match charset {
331            Charset::Utf8 => String::from_utf8(sample_data.to_vec())
332                .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
333            _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
334        }
335    }
336
337    fn default_charsets() -> Vec<Charset> {
338        vec![
339            Charset::Utf8,
340            Charset::Gbk,
341            Charset::ShiftJis,
342            Charset::Big5,
343            Charset::Iso88591,
344            Charset::Windows1252,
345        ]
346    }
347}
348
/// A charset paired with its heuristic confidence score.
#[derive(Debug, Clone)]
struct EncodingCandidate {
    // Candidate character set under evaluation.
    charset: Charset,
    // Heuristic score in 0.0..=1.0; higher is a better byte-pattern match.
    confidence: f32,
}
354
// `Default` mirrors `EncodingDetector::with_defaults` (0.8 threshold,
// 8 KiB sample window, built-in charset list).
impl Default for EncodingDetector {
    fn default() -> Self {
        Self::with_defaults()
    }
}
360
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // Builds a detector with a fixed 0.7 threshold so assertions do not
    // depend on the application configuration.
    fn create_test_detector() -> EncodingDetector {
        EncodingDetector {
            confidence_threshold: 0.7,
            max_sample_size: 8192,
            supported_charsets: EncodingDetector::default_charsets(),
        }
    }

    /// Test UTF-8 encoding detection
    #[test]
    fn test_utf8_detection_accuracy() {
        let detector = create_test_detector();
        let utf8_text = "Hello, 世界! Bonjour, monde! 🌍";

        let result = detector.detect_encoding(utf8_text.as_bytes()).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.8);
        assert!(!result.bom_detected);
        assert!(result.sample_text.contains("Hello"));
    }

    /// Test UTF-8 BOM detection
    #[test]
    fn test_utf8_bom_detection() {
        let detector = create_test_detector();
        let mut bom_data = vec![0xEF, 0xBB, 0xBF]; // UTF-8 BOM
        bom_data.extend_from_slice("Hello, World!".as_bytes());

        let result = detector.detect_encoding(&bom_data).unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert_eq!(result.confidence, 1.0);
        assert!(result.bom_detected);
        assert_eq!(result.sample_text, "UTF-8 with BOM");
    }

    /// Test UTF-16 BOM detection
    #[test]
    fn test_utf16_bom_detection() {
        let detector = create_test_detector();

        // UTF-16 LE BOM
        let utf16le_data = vec![0xFF, 0xFE, 0x48, 0x00, 0x65, 0x00]; // "He" in UTF-16 LE
        let result = detector.detect_encoding(&utf16le_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Le);
        assert!(result.bom_detected);

        // UTF-16 BE BOM
        let utf16be_data = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65]; // "He" in UTF-16 BE
        let result = detector.detect_encoding(&utf16be_data).unwrap();
        assert_eq!(result.charset, Charset::Utf16Be);
        assert!(result.bom_detected);
    }

    /// Test file encoding detection
    #[test]
    fn test_file_encoding_detection() {
        let detector = create_test_detector();
        let temp_dir = TempDir::new().unwrap();

        // Create UTF-8 file
        let utf8_path = temp_dir.path().join("utf8.txt");
        fs::write(&utf8_path, "測試檔案編碼檢測功能。").unwrap();

        let result = detector
            .detect_file_encoding(utf8_path.to_str().unwrap())
            .unwrap();

        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// Test error handling for non-existent files
    #[test]
    fn test_nonexistent_file_error() {
        let detector = create_test_detector();
        let result = detector.detect_file_encoding("nonexistent.txt");

        assert!(result.is_err());
    }

    /// Test GBK encoding pattern detection
    #[test]
    fn test_gbk_pattern_detection() {
        let detector = create_test_detector();

        // Simulate GBK encoding pattern (high byte range)
        let gbk_pattern = vec![
            0xC4, 0xE3, 0xBA, 0xC3, // "Hello" in GBK encoding
            0xCA, 0xC0, 0xBD, 0xE7, // "World" in GBK encoding
        ];

        let result = detector.detect_encoding(&gbk_pattern).unwrap();

        // Should detect as GBK or at least not UTF-8
        assert!(result.confidence > 0.3);
        if result.charset == Charset::Gbk {
            assert!(result.confidence > 0.5);
        }
    }

    /// Test Shift-JIS encoding detection
    #[test]
    fn test_shift_jis_detection() {
        let detector = create_test_detector();

        // Simulate Shift-JIS encoding pattern
        let shift_jis_pattern = vec![
            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
            0x82, 0xB1, 0x82, 0xF1, // こん (Shift-JIS)
            0x82, 0xC9, 0x82, 0xBF, // にち (Shift-JIS)
        ];

        let result = detector.detect_encoding(&shift_jis_pattern).unwrap();

        // Should detect as Shift-JIS or related encoding
        assert!(result.confidence > 0.2);
    }

    /// Test encoding confidence ranking
    #[test]
    fn test_encoding_confidence_ranking() {
        let detector = create_test_detector();

        // Clear UTF-8 text should have highest confidence
        let clear_utf8 = "Clear English text with numbers 123.";
        let utf8_result = detector.detect_encoding(clear_utf8.as_bytes()).unwrap();

        // Ambiguous data should have lower confidence
        let ambiguous_data: Vec<u8> = (0x80..=0xFF).cycle().take(50).collect();
        let ambiguous_result = detector.detect_encoding(&ambiguous_data).unwrap();

        assert!(utf8_result.confidence > ambiguous_result.confidence);
    }

    /// Test maximum sample size limit
    #[test]
    fn test_max_sample_size_limit() {
        let detector = create_test_detector();

        // Create data exceeding sample size limit
        let large_data = vec![b'A'; 10000]; // Assuming limit is 8192
        let result = detector.detect_encoding(&large_data).unwrap();

        // Should successfully detect without failing due to data size
        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.9);
    }

    /// Test encoding candidate selection logic
    #[test]
    fn test_encoding_candidate_selection() {
        let detector = create_test_detector();

        // Create data with mixed encoding features
        let mut mixed_data = b"English text ".to_vec();
        mixed_data.extend_from_slice(&[0xC3, 0xA9]); // é in UTF-8
        mixed_data.extend_from_slice(b" and more text");

        let result = detector.detect_encoding(&mixed_data).unwrap();

        // Should correctly choose UTF-8
        assert_eq!(result.charset, Charset::Utf8);
        assert!(result.confidence > 0.7);
    }

    /// Test fallback mechanism for unknown encodings
    #[test]
    fn test_unknown_encoding_fallback() {
        let detector = create_test_detector();

        // Create completely random data
        let random_data: Vec<u8> = (0..100).map(|i| (i * 7 + 13) as u8).collect();
        let result = detector.detect_encoding(&random_data).unwrap();

        // Should have a fallback encoding choice
        assert!(result.confidence >= 0.0);
        assert!(result.confidence <= 1.0);
    }

    /// Test encoding detection performance
    #[test]
    fn test_detection_performance() {
        let detector = create_test_detector();

        // Create medium-sized text file
        let large_text = "Hello, World! ".repeat(500);

        let start = std::time::Instant::now();
        let _result = detector.detect_encoding(large_text.as_bytes()).unwrap();
        let duration = start.elapsed();

        // Detection should complete within reasonable time (< 100ms)
        // NOTE(review): wall-clock assertions can flake on loaded CI hosts.
        assert!(duration.as_millis() < 100);
    }
}