subx_cli/core/formats/encoding/analyzer.rs

use crate::Result;
use crate::core::formats::encoding::charset::Charset;
use std::collections::HashMap;

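/// Byte-level analyzer that collects byte and bigram frequencies from a
/// buffer and derives the ASCII ratio, Shannon entropy, and control-character
/// ratio used to suggest likely encodings.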
pub struct ByteAnalyzer {
    byte_frequency: HashMap<u8, usize>,
    bigram_frequency: HashMap<(u8, u8), usize>,
    total_bytes: usize,
}

impl ByteAnalyzer {
    pub fn new() -> Self {
        Self {
            byte_frequency: HashMap::new(),
            bigram_frequency: HashMap::new(),
            total_bytes: 0,
        }
    }

    pub fn analyze(&mut self, data: &[u8]) -> Result<AnalysisResult> {
        self.collect_statistics(data);
        self.calculate_metrics()
    }

    fn collect_statistics(&mut self, data: &[u8]) {
        // Reset any state from a previous call so repeated `analyze` calls do
        // not mix statistics from different buffers.
        self.byte_frequency.clear();
        self.bigram_frequency.clear();
        self.total_bytes = data.len();
        for &b in data {
            *self.byte_frequency.entry(b).or_insert(0) += 1;
        }
        for window in data.windows(2) {
            if let [b1, b2] = window {
                *self.bigram_frequency.entry((*b1, *b2)).or_insert(0) += 1;
            }
        }
    }

    fn calculate_metrics(&self) -> Result<AnalysisResult> {
        let ascii_ratio = self.calculate_ascii_ratio();
        let entropy = self.calculate_entropy();
        let control_char_ratio = self.calculate_control_char_ratio();
        Ok(AnalysisResult {
            ascii_ratio,
            entropy,
            control_char_ratio,
            byte_distribution: self.byte_frequency.clone(),
            likely_encodings: self.suggest_encodings(ascii_ratio, entropy, control_char_ratio),
        })
    }

    fn calculate_ascii_ratio(&self) -> f32 {
        let ascii = self
            .byte_frequency
            .iter()
            .filter(|&(&b, _)| b < 0x80)
            .map(|(_, &c)| c)
            .sum::<usize>();
        if self.total_bytes > 0 {
            ascii as f32 / self.total_bytes as f32
        } else {
            0.0
        }
    }

    fn calculate_entropy(&self) -> f32 {
        // Shannon entropy of the byte distribution, in bits per byte.
        let mut entropy = 0.0;
        for &count in self.byte_frequency.values() {
            if count > 0 {
                let p = count as f32 / self.total_bytes as f32;
                entropy -= p * p.log2();
            }
        }
        entropy
    }

    fn calculate_control_char_ratio(&self) -> f32 {
        // Tab (0x09), LF (0x0A), and CR (0x0D) are expected in text and are
        // not counted as control characters here.
        let control = self
            .byte_frequency
            .iter()
            .filter(|&(&b, _)| b < 0x20 && b != 0x09 && b != 0x0A && b != 0x0D)
            .map(|(_, &c)| c)
            .sum::<usize>();
        if self.total_bytes > 0 {
            control as f32 / self.total_bytes as f32
        } else {
            0.0
        }
    }

    fn suggest_encodings(
        &self,
        ascii_ratio: f32,
        entropy: f32,
        control_ratio: f32,
    ) -> Vec<Charset> {
        let mut suggestions = Vec::new();
        if ascii_ratio > 0.9 {
            suggestions.push(Charset::Utf8);
        }
        if entropy > 6.0 && ascii_ratio < 0.8 {
            suggestions.extend_from_slice(&[Charset::Gbk, Charset::Big5, Charset::ShiftJis]);
        }
        if control_ratio > 0.01 {
            suggestions.push(Charset::Windows1252);
        }
        if suggestions.is_empty() {
            suggestions.push(Charset::Utf8);
        }
        suggestions
    }
}

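/// Summary statistics produced by [`ByteAnalyzer::analyze`], including the
/// encodings the heuristics consider most likely.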
#[derive(Debug, Clone)]
pub struct AnalysisResult {
    pub ascii_ratio: f32,
    pub entropy: f32,
    pub control_char_ratio: f32,
    pub byte_distribution: HashMap<u8, usize>,
    pub likely_encodings: Vec<Charset>,
}

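/// Scores byte data against per-charset language models built from common
/// and invalid byte patterns.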
pub struct StatisticalAnalyzer {
    language_models: HashMap<Charset, LanguageModel>,
}

impl StatisticalAnalyzer {
    pub fn new() -> Self {
        Self {
            language_models: Self::build_language_models(),
        }
    }

    fn build_language_models() -> HashMap<Charset, LanguageModel> {
        let mut models = HashMap::new();
        // Each model pairs byte values that commonly lead multi-byte sequences
        // in the encoding with a heuristic weight used for scoring.
        models.insert(
            Charset::Utf8,
            LanguageModel {
                charset: Charset::Utf8,
                common_patterns: vec![
                    (0xC2, 0.05),
                    (0xC3, 0.08),
                    (0xE2, 0.12),
                    (0xE3, 0.15),
                    (0xE4, 0.18),
                    (0xE5, 0.20),
                ],
                invalid_patterns: vec![(0x80, 0.0), (0xBF, 0.0)],
            },
        );
        models.insert(
            Charset::Gbk,
            LanguageModel {
                charset: Charset::Gbk,
                common_patterns: vec![
                    (0xB0, 0.15),
                    (0xC4, 0.12),
                    (0xD6, 0.10),
                    (0xB8, 0.08),
                    (0xBF, 0.06),
                    (0xCE, 0.05),
                ],
                invalid_patterns: vec![(0x7F, 0.0)],
            },
        );
        models
    }

    pub fn analyze_with_models(&self, data: &[u8]) -> Result<HashMap<Charset, f32>> {
        let mut scores = HashMap::new();
        for (cs, model) in &self.language_models {
            let score = self.calculate_model_score(data, model)?;
            scores.insert(cs.clone(), score);
        }
        Ok(scores)
    }

    fn calculate_model_score(&self, data: &[u8], model: &LanguageModel) -> Result<f32> {
        let mut score = 0.0;
        for &b in data {
            for &(pb, w) in &model.common_patterns {
                if b == pb {
                    score += w;
                }
            }
            // Invalid bytes apply a fixed penalty; the weight stored with the
            // pattern is not used here.
            for &(ib, _) in &model.invalid_patterns {
                if b == ib {
                    score -= 0.1;
                }
            }
        }
        Ok(if !data.is_empty() {
            score / data.len() as f32
        } else {
            0.0
        })
    }
}

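/// Byte-level model for a single charset: bytes that commonly appear, each
/// with a heuristic weight, and bytes that should not appear at all.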
#[derive(Debug, Clone)]
struct LanguageModel {
    charset: Charset,
    common_patterns: Vec<(u8, f32)>,
    invalid_patterns: Vec<(u8, f32)>,
}

impl Default for ByteAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

impl Default for StatisticalAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}
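
// A minimal usage sketch, added for illustration rather than taken from the
// original crate. It assumes the error type behind `crate::Result` implements
// `Debug` (so `unwrap` is available) and relies only on the public items
// defined above; `Charset` already satisfies `Eq` because it is used as a
// `HashMap` key.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ascii_text_suggests_utf8() {
        let mut analyzer = ByteAnalyzer::new();
        let result = analyzer.analyze(b"plain ascii subtitle text").unwrap();
        // Pure ASCII input keeps the ratio at 1.0, so UTF-8 is suggested.
        assert!(result.ascii_ratio > 0.9);
        assert!(result.likely_encodings.contains(&Charset::Utf8));
    }

    #[test]
    fn model_scores_cover_each_built_model() {
        let analyzer = StatisticalAnalyzer::new();
        let scores = analyzer
            .analyze_with_models(&[0xB0, 0xA1, 0xC4, 0xE3])
            .unwrap();
        // `build_language_models` currently registers UTF-8 and GBK models.
        assert_eq!(scores.len(), 2);
        assert!(scores.contains_key(&Charset::Gbk));
    }
}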