subx_cli/core/formats/encoding/analyzer.rs
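
//! Statistical analyzers for character-encoding detection.
//!
//! [`ByteAnalyzer`] computes byte-level statistics (ASCII ratio, Shannon
//! entropy, control-character ratio), while [`StatisticalAnalyzer`] scores
//! data against per-charset language models.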

use crate::Result;
use crate::core::formats::encoding::charset::Charset;
use std::collections::HashMap;

/// Single-byte and double-byte statistical analyzer.
pub struct ByteAnalyzer {
    byte_frequency: HashMap<u8, usize>,
    bigram_frequency: HashMap<(u8, u8), usize>,
    total_bytes: usize,
}

impl ByteAnalyzer {
    /// Creates a new `ByteAnalyzer` instance.
    ///
    /// Initializes empty frequency maps and resets counters.
    pub fn new() -> Self {
        Self {
            byte_frequency: HashMap::new(),
            bigram_frequency: HashMap::new(),
            total_bytes: 0,
        }
    }

    /// Analyzes the given byte data and returns encoding analysis results.
    ///
    /// # Arguments
    ///
    /// * `data` - The byte data to analyze for encoding detection
    ///
    /// # Returns
    ///
    /// Returns an `AnalysisResult` containing statistical information about
    /// the data that can be used for encoding detection.
    ///
    /// # Errors
    ///
    /// Returns an error if the analysis cannot be completed due to
    /// insufficient data or computational issues.
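    ///
    /// # Examples
    ///
    /// A minimal sketch of the expected call flow (the import path is an
    /// assumption and may differ in this crate):
    ///
    /// ```ignore
    /// use subx_cli::core::formats::encoding::analyzer::ByteAnalyzer;
    ///
    /// let mut analyzer = ByteAnalyzer::new();
    /// let result = analyzer.analyze(b"Hello, World!")?;
    /// assert!(result.ascii_ratio > 0.9);
    /// ```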
    pub fn analyze(&mut self, data: &[u8]) -> Result<AnalysisResult> {
        self.collect_statistics(data);
        self.calculate_metrics()
    }

    fn collect_statistics(&mut self, data: &[u8]) {
        // Clear any state from a previous call so that repeated `analyze`
        // invocations do not mix statistics across inputs.
        self.byte_frequency.clear();
        self.bigram_frequency.clear();
        self.total_bytes = data.len();
        for &b in data {
            *self.byte_frequency.entry(b).or_insert(0) += 1;
        }
        // Count overlapping byte pairs (bigrams).
        for window in data.windows(2) {
            if let [b1, b2] = window {
                *self.bigram_frequency.entry((*b1, *b2)).or_insert(0) += 1;
            }
        }
    }

    fn calculate_metrics(&self) -> Result<AnalysisResult> {
        let ascii_ratio = self.calculate_ascii_ratio();
        let entropy = self.calculate_entropy();
        let control_char_ratio = self.calculate_control_char_ratio();
        Ok(AnalysisResult {
            ascii_ratio,
            entropy,
            control_char_ratio,
            byte_distribution: self.byte_frequency.clone(),
            likely_encodings: self.suggest_encodings(ascii_ratio, entropy, control_char_ratio),
        })
    }

    fn calculate_ascii_ratio(&self) -> f32 {
        let ascii = self
            .byte_frequency
            .iter()
            .filter(|&(&b, _)| b < 0x80)
            .map(|(_, &c)| c)
            .sum::<usize>();
        if self.total_bytes > 0 {
            ascii as f32 / self.total_bytes as f32
        } else {
            0.0
        }
    }

    fn calculate_entropy(&self) -> f32 {
        // Shannon entropy of the byte distribution:
        // H = -sum(p_i * log2(p_i)) with p_i = count_i / total_bytes.
        // Ranges from 0.0 (a single repeated byte) to 8.0 (uniform bytes).
        let mut entropy = 0.0;
        for &count in self.byte_frequency.values() {
            if count > 0 {
                let p = count as f32 / self.total_bytes as f32;
                entropy -= p * p.log2();
            }
        }
        entropy
    }

    fn calculate_control_char_ratio(&self) -> f32 {
        // Count C0 control bytes, excluding tab (0x09), LF (0x0A), and
        // CR (0x0D), which are expected in ordinary text.
        let control = self
            .byte_frequency
            .iter()
            .filter(|&(&b, _)| b < 0x20 && b != 0x09 && b != 0x0A && b != 0x0D)
            .map(|(_, &c)| c)
            .sum::<usize>();
        if self.total_bytes > 0 {
            control as f32 / self.total_bytes as f32
        } else {
            0.0
        }
    }

    fn suggest_encodings(
        &self,
        ascii_ratio: f32,
        entropy: f32,
        control_ratio: f32,
    ) -> Vec<Charset> {
        let mut suggestions = Vec::new();
        // Mostly ASCII data is almost certainly valid UTF-8.
        if ascii_ratio > 0.9 {
            suggestions.push(Charset::Utf8);
        }
        // High entropy with many non-ASCII bytes hints at a multibyte
        // CJK encoding.
        if entropy > 6.0 && ascii_ratio < 0.8 {
            suggestions.extend_from_slice(&[Charset::Gbk, Charset::Big5, Charset::ShiftJis]);
        }
        // A notable share of control bytes suggests a legacy single-byte
        // encoding such as Windows-1252.
        if control_ratio > 0.01 {
            suggestions.push(Charset::Windows1252);
        }
        // Fall back to UTF-8 when no heuristic fires.
        if suggestions.is_empty() {
            suggestions.push(Charset::Utf8);
        }
        suggestions
    }
}

/// Statistical analysis result for encoding detection.
///
/// Contains various metrics computed from byte data analysis that help
/// determine the most likely character encoding for text data.
#[derive(Debug, Clone)]
pub struct AnalysisResult {
    /// Ratio of ASCII bytes (0x00-0x7F) in the data
    pub ascii_ratio: f32,
    /// Shannon entropy of the byte distribution
    pub entropy: f32,
    /// Ratio of control characters in the data
    pub control_char_ratio: f32,
    /// Frequency distribution of all bytes
    pub byte_distribution: HashMap<u8, usize>,
    /// Candidate encodings suggested by the analysis heuristics
    pub likely_encodings: Vec<Charset>,
}

/// Statistical language model-based analyzer for encoding detection.
///
/// Uses statistical models and language patterns to improve encoding
/// detection accuracy beyond simple byte frequency analysis.
pub struct StatisticalAnalyzer {
    language_models: HashMap<Charset, LanguageModel>,
}

impl StatisticalAnalyzer {
    /// Creates a new `StatisticalAnalyzer` with pre-built language models.
    ///
    /// Initializes language models for various character encodings to
    /// enable statistical analysis of text patterns.
    pub fn new() -> Self {
        Self {
            language_models: Self::build_language_models(),
        }
    }

    fn build_language_models() -> HashMap<Charset, LanguageModel> {
        let mut models = HashMap::new();
        models.insert(
            Charset::Utf8,
            LanguageModel {
                charset: Charset::Utf8,
                // Heuristic weights for common UTF-8 lead bytes
                // (0xC2/0xC3: Latin-1 supplement; 0xE4/0xE5: CJK range).
                common_patterns: vec![
                    (0xC2, 0.05),
                    (0xC3, 0.08),
                    (0xE2, 0.12),
                    (0xE3, 0.15),
                    (0xE4, 0.18),
                    (0xE5, 0.20),
                ],
                // Continuation bytes, which are invalid in lead position.
                invalid_patterns: vec![(0x80, 0.0), (0xBF, 0.0)],
            },
        );
        models.insert(
            Charset::Gbk,
            LanguageModel {
                charset: Charset::Gbk,
                // Heuristic weights for high-frequency GBK lead bytes.
                common_patterns: vec![
                    (0xB0, 0.15),
                    (0xC4, 0.12),
                    (0xD6, 0.10),
                    (0xB8, 0.08),
                    (0xBF, 0.06),
                    (0xCE, 0.05),
                ],
                invalid_patterns: vec![(0x7F, 0.0)],
            },
        );
        models
    }

    /// Analyzes byte data using language models to determine encoding likelihood.
    ///
    /// # Arguments
    ///
    /// * `data` - The byte data to analyze
    ///
    /// # Returns
    ///
    /// Returns a `HashMap` mapping each charset to its likelihood score.
    /// Higher scores indicate higher likelihood that the data is encoded
    /// in that character set.
    ///
    /// # Errors
    ///
    /// Returns an error if the model scoring calculation fails.
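    ///
    /// # Examples
    ///
    /// A minimal sketch (import paths are assumptions; `?` requires a
    /// `Result`-returning context):
    ///
    /// ```ignore
    /// use subx_cli::core::formats::encoding::analyzer::StatisticalAnalyzer;
    /// use subx_cli::core::formats::encoding::charset::Charset;
    ///
    /// let analyzer = StatisticalAnalyzer::new();
    /// let scores = analyzer.analyze_with_models("你好".as_bytes())?;
    /// if let Some(score) = scores.get(&Charset::Utf8) {
    ///     println!("UTF-8 score: {score:.3}");
    /// }
    /// ```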
    pub fn analyze_with_models(&self, data: &[u8]) -> Result<HashMap<Charset, f32>> {
        let mut scores = HashMap::new();
        for (cs, model) in &self.language_models {
            let score = self.calculate_model_score(data, model)?;
            scores.insert(cs.clone(), score);
        }
        Ok(scores)
    }

    fn calculate_model_score(&self, data: &[u8], model: &LanguageModel) -> Result<f32> {
        let mut score = 0.0;
        for &b in data {
            // Reward bytes that match the model's common patterns.
            for &(pb, w) in &model.common_patterns {
                if b == pb {
                    score += w;
                }
            }
            // Apply a fixed penalty for bytes the model marks invalid
            // (the stored weight is currently unused).
            for &(ib, _) in &model.invalid_patterns {
                if b == ib {
                    score -= 0.1;
                }
            }
        }
        // Normalize by length so scores are comparable across inputs.
        Ok(if !data.is_empty() {
            score / data.len() as f32
        } else {
            0.0
        })
    }
}

/// Byte-pattern language model used for statistical scoring.
#[derive(Debug, Clone)]
struct LanguageModel {
    /// The character set this model describes.
    charset: Charset,
    /// Bytes common in this encoding, with heuristic weights.
    common_patterns: Vec<(u8, f32)>,
    /// Bytes that should not appear; matches incur a fixed penalty.
    invalid_patterns: Vec<(u8, f32)>,
}

impl Default for ByteAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

impl Default for StatisticalAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Test byte analyzer basic functionality
    #[test]
    fn test_byte_analyzer_basic_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let test_data = b"Hello, World! 123";

        let result = analyzer.analyze(test_data).unwrap();

        // Verify ASCII ratio
        assert!(result.ascii_ratio > 0.9);
        assert!(result.ascii_ratio <= 1.0);

        // Verify entropy is within a reasonable range
        assert!(result.entropy > 0.0);
        assert!(result.entropy < 8.0);

        // Verify control character ratio
        assert!(result.control_char_ratio < 0.1);

        // Verify encoding suggestions
        assert!(result.likely_encodings.contains(&Charset::Utf8));
    }

    /// Test Chinese text encoding analysis
    #[test]
    fn test_chinese_text_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let chinese_text = "你好,世界!測試中文編碼檢測。".as_bytes();

        let result = analyzer.analyze(chinese_text).unwrap();

        // Chinese text should have a lower ASCII ratio
        assert!(result.ascii_ratio < 0.5);

        // Entropy should be greater than zero
        assert!(result.entropy > 0.0);

        // Should suggest UTF-8 or another CJK-capable encoding
        let has_unicode_encoding = result
            .likely_encodings
            .iter()
            .any(|charset| matches!(charset, Charset::Utf8 | Charset::Gbk | Charset::Big5));
        assert!(has_unicode_encoding);
    }

    /// Test binary data analysis
    #[test]
    fn test_binary_data_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let binary_data: Vec<u8> = (0..=255).cycle().take(1000).collect();

        let result = analyzer.analyze(&binary_data).unwrap();

        // Binary data should have high entropy
        assert!(result.entropy > 7.0);

        // The ASCII ratio should be approximately 50%
        assert!(result.ascii_ratio > 0.4);
        assert!(result.ascii_ratio < 0.6);
    }

    /// Test entropy calculation accuracy
    #[test]
    fn test_entropy_calculation_accuracy() {
        let mut analyzer = ByteAnalyzer::new();

        // A completely uniform distribution should have maximum entropy
        let uniform_data: Vec<u8> = (0..=255).collect();
        let uniform_result = analyzer.analyze(&uniform_data).unwrap();

        // Reset analyzer
        analyzer = ByteAnalyzer::new();

        // A single repeated byte should have near-zero entropy
        let single_char_data = vec![b'A'; 100];
        let single_result = analyzer.analyze(&single_char_data).unwrap();

        assert!(uniform_result.entropy > single_result.entropy);
        assert!(single_result.entropy < 1.0);
    }

    /// Test control character detection
    #[test]
    fn test_control_character_detection() {
        let mut analyzer = ByteAnalyzer::new();

        // Create data containing control characters
        let mut data_with_control = Vec::new();
        data_with_control.extend_from_slice(b"Normal text ");
        data_with_control.push(0x01); // SOH
        data_with_control.push(0x02); // STX
        data_with_control.push(0x1F); // US
        data_with_control.extend_from_slice(b" more text");

        let result = analyzer.analyze(&data_with_control).unwrap();

        // Should detect control characters
        assert!(result.control_char_ratio > 0.0);
        assert!(result.control_char_ratio < 0.5);

        // Should suggest Windows-1252 encoding
        assert!(result.likely_encodings.contains(&Charset::Windows1252));
    }

    /// Test statistical analyzer language models
    #[test]
    fn test_statistical_analyzer_language_models() {
        let analyzer = StatisticalAnalyzer::new();

        // Test UTF-8 Chinese text
        let utf8_chinese = "这是一个测试文本。".as_bytes();
        let utf8_scores = analyzer.analyze_with_models(utf8_chinese).unwrap();

        // UTF-8 should be present as a candidate encoding
        assert!(utf8_scores.contains_key(&Charset::Utf8));

        // Test GBK-patterned bytes
        let gbk_pattern = vec![0xB0, 0xA1, 0xC4, 0xE3, 0xBA, 0xC3]; // Simulated GBK-encoded bytes
        let gbk_scores = analyzer.analyze_with_models(&gbk_pattern).unwrap();

        // GBK should receive a positive score
        assert!(gbk_scores.get(&Charset::Gbk).unwrap_or(&0.0) > &0.0);
    }

    /// Test byte frequency distribution analysis
    #[test]
    fn test_byte_frequency_distribution() {
        let mut analyzer = ByteAnalyzer::new();
        let repeated_data = b"aaabbbccc";

        let result = analyzer.analyze(repeated_data).unwrap();

        // Verify the byte distribution is recorded correctly
        assert!(!result.byte_distribution.is_empty());
        assert_eq!(*result.byte_distribution.get(&b'a').unwrap(), 3);
        assert_eq!(*result.byte_distribution.get(&b'b').unwrap(), 3);
        assert_eq!(*result.byte_distribution.get(&b'c').unwrap(), 3);
    }

    /// Test empty data handling
    #[test]
    fn test_empty_data_handling() {
        let mut analyzer = ByteAnalyzer::new();
        let empty_data = b"";

        let result = analyzer.analyze(empty_data).unwrap();

        // Empty data should return default values
        assert_eq!(result.ascii_ratio, 0.0);
        assert_eq!(result.entropy, 0.0);
        assert_eq!(result.control_char_ratio, 0.0);
        assert!(!result.likely_encodings.is_empty());
    }

    /// Test encoding suggestion logic
    #[test]
    fn test_encoding_suggestion_logic() {
        let mut analyzer = ByteAnalyzer::new();

        // A high ASCII ratio should suggest UTF-8
        let ascii_heavy = b"Hello World! 123 ABC";
        let ascii_result = analyzer.analyze(ascii_heavy).unwrap();
        assert!(ascii_result.likely_encodings.contains(&Charset::Utf8));

        // Reset analyzer
        analyzer = ByteAnalyzer::new();

        // High entropy with a low ASCII ratio should suggest multibyte encodings
        let multibyte_pattern: Vec<u8> = (0x80..=0xFF).cycle().take(100).collect();
        let multibyte_result = analyzer.analyze(&multibyte_pattern).unwrap();

        let has_multibyte_encoding = multibyte_result
            .likely_encodings
            .iter()
            .any(|charset| matches!(charset, Charset::Gbk | Charset::Big5 | Charset::ShiftJis));
        assert!(has_multibyte_encoding);
    }

    /// Test bigram pattern analysis
    #[test]
    fn test_bigram_pattern_analysis() {
        let mut analyzer = ByteAnalyzer::new();

        // Create data with an obvious repeating bigram pattern
        let pattern_data = b"abcabcabcabc";
        let _result = analyzer.analyze(pattern_data).unwrap();

        // Bigram frequencies are collected but not yet surfaced in
        // `AnalysisResult`, so verify the raw counts here. "abcabcabcabc"
        // contains 11 overlapping bigrams: (a,b) x4, (b,c) x4, (c,a) x3.
        assert_eq!(*analyzer.bigram_frequency.get(&(b'a', b'b')).unwrap(), 4);
        assert_eq!(*analyzer.bigram_frequency.get(&(b'b', b'c')).unwrap(), 4);
        assert_eq!(*analyzer.bigram_frequency.get(&(b'c', b'a')).unwrap(), 3);
    }
}
462}