subx_cli/core/formats/encoding/
analyzer.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::Charset;
3use std::collections::HashMap;
4
/// Single-byte and byte-pair statistics collector.
///
/// Counts how often each byte value and each pair of adjacent bytes occurs
/// in the analyzed input, and remembers the input length so ratios can be
/// derived from the counts.
#[derive(Debug, Clone)]
pub struct ByteAnalyzer {
    /// Number of occurrences of each byte value.
    byte_frequency: HashMap<u8, usize>,
    /// Number of occurrences of each pair of adjacent bytes.
    bigram_frequency: HashMap<(u8, u8), usize>,
    /// Length, in bytes, of the most recently analyzed input.
    total_bytes: usize,
}
11
12impl ByteAnalyzer {
13    pub fn new() -> Self {
14        Self {
15            byte_frequency: HashMap::new(),
16            bigram_frequency: HashMap::new(),
17            total_bytes: 0,
18        }
19    }
20
21    pub fn analyze(&mut self, data: &[u8]) -> Result<AnalysisResult> {
22        self.collect_statistics(data);
23        self.calculate_metrics()
24    }
25
26    fn collect_statistics(&mut self, data: &[u8]) {
27        self.total_bytes = data.len();
28        for &b in data {
29            *self.byte_frequency.entry(b).or_insert(0) += 1;
30        }
31        for window in data.windows(2) {
32            if let [b1, b2] = window {
33                *self.bigram_frequency.entry((*b1, *b2)).or_insert(0) += 1;
34            }
35        }
36    }
37
38    fn calculate_metrics(&self) -> Result<AnalysisResult> {
39        let ascii_ratio = self.calculate_ascii_ratio();
40        let entropy = self.calculate_entropy();
41        let control_char_ratio = self.calculate_control_char_ratio();
42        Ok(AnalysisResult {
43            ascii_ratio,
44            entropy,
45            control_char_ratio,
46            byte_distribution: self.byte_frequency.clone(),
47            likely_encodings: self.suggest_encodings(ascii_ratio, entropy, control_char_ratio),
48        })
49    }
50
51    fn calculate_ascii_ratio(&self) -> f32 {
52        let ascii = self
53            .byte_frequency
54            .iter()
55            .filter(|&(&b, _)| b < 0x80)
56            .map(|(_, &c)| c)
57            .sum::<usize>();
58        if self.total_bytes > 0 {
59            ascii as f32 / self.total_bytes as f32
60        } else {
61            0.0
62        }
63    }
64
65    fn calculate_entropy(&self) -> f32 {
66        let mut entropy = 0.0;
67        for &count in self.byte_frequency.values() {
68            if count > 0 {
69                let p = count as f32 / self.total_bytes as f32;
70                entropy -= p * p.log2();
71            }
72        }
73        entropy
74    }
75
76    fn calculate_control_char_ratio(&self) -> f32 {
77        let control = self
78            .byte_frequency
79            .iter()
80            .filter(|&(&b, _)| b < 0x20 && b != 0x09 && b != 0x0A && b != 0x0D)
81            .map(|(_, &c)| c)
82            .sum::<usize>();
83        if self.total_bytes > 0 {
84            control as f32 / self.total_bytes as f32
85        } else {
86            0.0
87        }
88    }
89
90    fn suggest_encodings(
91        &self,
92        ascii_ratio: f32,
93        entropy: f32,
94        control_ratio: f32,
95    ) -> Vec<Charset> {
96        let mut suggestions = Vec::new();
97        if ascii_ratio > 0.9 {
98            suggestions.push(Charset::Utf8);
99        }
100        if entropy > 6.0 && ascii_ratio < 0.8 {
101            suggestions.extend_from_slice(&[Charset::Gbk, Charset::Big5, Charset::ShiftJis]);
102        }
103        if control_ratio > 0.01 {
104            suggestions.push(Charset::Windows1252);
105        }
106        if suggestions.is_empty() {
107            suggestions.push(Charset::Utf8);
108        }
109        suggestions
110    }
111}
112
113/// 統計分析結果
114#[derive(Debug, Clone)]
115pub struct AnalysisResult {
116    pub ascii_ratio: f32,
117    pub entropy: f32,
118    pub control_char_ratio: f32,
119    pub byte_distribution: HashMap<u8, usize>,
120    pub likely_encodings: Vec<Charset>,
121}
122
123/// 基於語言模型的統計分析器
124pub struct StatisticalAnalyzer {
125    language_models: HashMap<Charset, LanguageModel>,
126}
127
128impl StatisticalAnalyzer {
129    pub fn new() -> Self {
130        Self {
131            language_models: Self::build_language_models(),
132        }
133    }
134
135    fn build_language_models() -> HashMap<Charset, LanguageModel> {
136        let mut models = HashMap::new();
137        models.insert(
138            Charset::Utf8,
139            LanguageModel {
140                charset: Charset::Utf8,
141                common_patterns: vec![
142                    (0xC2, 0.05),
143                    (0xC3, 0.08),
144                    (0xE2, 0.12),
145                    (0xE3, 0.15),
146                    (0xE4, 0.18),
147                    (0xE5, 0.20),
148                ],
149                invalid_patterns: vec![(0x80, 0.0), (0xBF, 0.0)],
150            },
151        );
152        models.insert(
153            Charset::Gbk,
154            LanguageModel {
155                charset: Charset::Gbk,
156                common_patterns: vec![
157                    (0xB0, 0.15),
158                    (0xC4, 0.12),
159                    (0xD6, 0.10),
160                    (0xB8, 0.08),
161                    (0xBF, 0.06),
162                    (0xCE, 0.05),
163                ],
164                invalid_patterns: vec![(0x7F, 0.0)],
165            },
166        );
167        models
168    }
169
170    pub fn analyze_with_models(&self, data: &[u8]) -> Result<HashMap<Charset, f32>> {
171        let mut scores = HashMap::new();
172        for (cs, model) in &self.language_models {
173            let score = self.calculate_model_score(data, model)?;
174            scores.insert(cs.clone(), score);
175        }
176        Ok(scores)
177    }
178
179    fn calculate_model_score(&self, data: &[u8], model: &LanguageModel) -> Result<f32> {
180        let mut score = 0.0;
181        for &b in data {
182            for &(pb, w) in &model.common_patterns {
183                if b == pb {
184                    score += w;
185                }
186            }
187            for &(ib, _) in &model.invalid_patterns {
188                if b == ib {
189                    score -= 0.1;
190                }
191            }
192        }
193        Ok(if !data.is_empty() {
194            score / data.len() as f32
195        } else {
196            0.0
197        })
198    }
199}
200
201/// 語言模型結構
202#[derive(Debug, Clone)]
203struct LanguageModel {
204    charset: Charset,
205    common_patterns: Vec<(u8, f32)>,
206    invalid_patterns: Vec<(u8, f32)>,
207}
208
209impl Default for ByteAnalyzer {
210    fn default() -> Self {
211        Self::new()
212    }
213}
214impl Default for StatisticalAnalyzer {
215    fn default() -> Self {
216        Self::new()
217    }
218}