//! subx_cli/core/formats/encoding/analyzer.rs
//!
//! Byte-level and statistical analysis used for character-encoding detection.
use crate::Result;
use crate::core::formats::encoding::charset::Charset;
use std::collections::HashMap;
4
/// Accumulates byte-level statistics over a buffer and derives
/// encoding-detection metrics (ASCII ratio, entropy, control-char ratio)
/// from them.
pub struct ByteAnalyzer {
    // Occurrence count per byte value for the analyzed input.
    byte_frequency: HashMap<u8, usize>,
    // Occurrence count per ordered byte pair. NOTE(review): collected but
    // not read by any metric visible in this file — confirm it is needed.
    bigram_frequency: HashMap<(u8, u8), usize>,
    // Length in bytes of the most recently analyzed input.
    total_bytes: usize,
}
11
12impl ByteAnalyzer {
13 pub fn new() -> Self {
17 Self {
18 byte_frequency: HashMap::new(),
19 bigram_frequency: HashMap::new(),
20 total_bytes: 0,
21 }
22 }
23
24 pub fn analyze(&mut self, data: &[u8]) -> Result<AnalysisResult> {
40 self.collect_statistics(data);
41 self.calculate_metrics()
42 }
43
44 fn collect_statistics(&mut self, data: &[u8]) {
45 self.total_bytes = data.len();
46 for &b in data {
47 *self.byte_frequency.entry(b).or_insert(0) += 1;
48 }
49 for window in data.windows(2) {
50 if let [b1, b2] = window {
51 *self.bigram_frequency.entry((*b1, *b2)).or_insert(0) += 1;
52 }
53 }
54 }
55
56 fn calculate_metrics(&self) -> Result<AnalysisResult> {
57 let ascii_ratio = self.calculate_ascii_ratio();
58 let entropy = self.calculate_entropy();
59 let control_char_ratio = self.calculate_control_char_ratio();
60 Ok(AnalysisResult {
61 ascii_ratio,
62 entropy,
63 control_char_ratio,
64 byte_distribution: self.byte_frequency.clone(),
65 likely_encodings: self.suggest_encodings(ascii_ratio, entropy, control_char_ratio),
66 })
67 }
68
69 fn calculate_ascii_ratio(&self) -> f32 {
70 let ascii = self
71 .byte_frequency
72 .iter()
73 .filter(|&(&b, _)| b < 0x80)
74 .map(|(_, &c)| c)
75 .sum::<usize>();
76 if self.total_bytes > 0 {
77 ascii as f32 / self.total_bytes as f32
78 } else {
79 0.0
80 }
81 }
82
83 fn calculate_entropy(&self) -> f32 {
84 let mut entropy = 0.0;
85 for &count in self.byte_frequency.values() {
86 if count > 0 {
87 let p = count as f32 / self.total_bytes as f32;
88 entropy -= p * p.log2();
89 }
90 }
91 entropy
92 }
93
94 fn calculate_control_char_ratio(&self) -> f32 {
95 let control = self
96 .byte_frequency
97 .iter()
98 .filter(|&(&b, _)| b < 0x20 && b != 0x09 && b != 0x0A && b != 0x0D)
99 .map(|(_, &c)| c)
100 .sum::<usize>();
101 if self.total_bytes > 0 {
102 control as f32 / self.total_bytes as f32
103 } else {
104 0.0
105 }
106 }
107
108 fn suggest_encodings(
109 &self,
110 ascii_ratio: f32,
111 entropy: f32,
112 control_ratio: f32,
113 ) -> Vec<Charset> {
114 let mut suggestions = Vec::new();
115 if ascii_ratio > 0.9 {
116 suggestions.push(Charset::Utf8);
117 }
118 if entropy > 6.0 && ascii_ratio < 0.8 {
119 suggestions.extend_from_slice(&[Charset::Gbk, Charset::Big5, Charset::ShiftJis]);
120 }
121 if control_ratio > 0.01 {
122 suggestions.push(Charset::Windows1252);
123 }
124 if suggestions.is_empty() {
125 suggestions.push(Charset::Utf8);
126 }
127 suggestions
128 }
129}
130
/// Metrics produced by `ByteAnalyzer::analyze`.
#[derive(Debug, Clone)]
pub struct AnalysisResult {
    /// Fraction of bytes below 0x80 (0.0 for empty input).
    pub ascii_ratio: f32,
    /// Shannon entropy of the byte distribution, in bits (0.0..=8.0).
    pub entropy: f32,
    /// Fraction of control bytes (< 0x20) excluding tab, LF and CR.
    pub control_char_ratio: f32,
    /// Raw per-byte occurrence counts for the analyzed input.
    pub byte_distribution: HashMap<u8, usize>,
    /// Heuristic charset suggestions; never empty (UTF-8 is the fallback).
    pub likely_encodings: Vec<Charset>,
}
148
/// Scores raw bytes against per-charset language models to estimate
/// which encoding most likely produced them.
pub struct StatisticalAnalyzer {
    // One heuristic byte-pattern model per supported charset.
    language_models: HashMap<Charset, LanguageModel>,
}
156
157impl StatisticalAnalyzer {
158 pub fn new() -> Self {
163 Self {
164 language_models: Self::build_language_models(),
165 }
166 }
167
168 fn build_language_models() -> HashMap<Charset, LanguageModel> {
169 let mut models = HashMap::new();
170 models.insert(
171 Charset::Utf8,
172 LanguageModel {
173 charset: Charset::Utf8,
174 common_patterns: vec![
175 (0xC2, 0.05),
176 (0xC3, 0.08),
177 (0xE2, 0.12),
178 (0xE3, 0.15),
179 (0xE4, 0.18),
180 (0xE5, 0.20),
181 ],
182 invalid_patterns: vec![(0x80, 0.0), (0xBF, 0.0)],
183 },
184 );
185 models.insert(
186 Charset::Gbk,
187 LanguageModel {
188 charset: Charset::Gbk,
189 common_patterns: vec![
190 (0xB0, 0.15),
191 (0xC4, 0.12),
192 (0xD6, 0.10),
193 (0xB8, 0.08),
194 (0xBF, 0.06),
195 (0xCE, 0.05),
196 ],
197 invalid_patterns: vec![(0x7F, 0.0)],
198 },
199 );
200 models
201 }
202
203 pub fn analyze_with_models(&self, data: &[u8]) -> Result<HashMap<Charset, f32>> {
219 let mut scores = HashMap::new();
220 for (cs, model) in &self.language_models {
221 let score = self.calculate_model_score(data, model)?;
222 scores.insert(cs.clone(), score);
223 }
224 Ok(scores)
225 }
226
227 fn calculate_model_score(&self, data: &[u8], model: &LanguageModel) -> Result<f32> {
228 let mut score = 0.0;
229 for &b in data {
230 for &(pb, w) in &model.common_patterns {
231 if b == pb {
232 score += w;
233 }
234 }
235 for &(ib, _) in &model.invalid_patterns {
236 if b == ib {
237 score -= 0.1;
238 }
239 }
240 }
241 Ok(if !data.is_empty() {
242 score / data.len() as f32
243 } else {
244 0.0
245 })
246 }
247}
248
/// Heuristic byte-pattern model for a single charset.
#[derive(Debug, Clone)]
struct LanguageModel {
    // Charset this model describes. NOTE(review): not read anywhere in this
    // file — the `language_models` map key carries the same information.
    charset: Charset,
    // Bytes characteristic of the charset, paired with scoring weights.
    common_patterns: Vec<(u8, f32)>,
    // Bytes that should not appear; a match incurs a fixed -0.1 penalty
    // (the per-entry weight is currently unused by the scorer).
    invalid_patterns: Vec<(u8, f32)>,
}
256
impl Default for ByteAnalyzer {
    /// Equivalent to `ByteAnalyzer::new`.
    fn default() -> Self {
        Self::new()
    }
}
impl Default for StatisticalAnalyzer {
    /// Equivalent to `StatisticalAnalyzer::new`.
    fn default() -> Self {
        Self::new()
    }
}
267
#[cfg(test)]
mod tests {
    use super::*;

    // Plain ASCII input: ascii_ratio near 1.0, bounded entropy, UTF-8 suggested.
    #[test]
    fn test_byte_analyzer_basic_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let test_data = b"Hello, World! 123";

        let result = analyzer.analyze(test_data).unwrap();

        // Every byte is ASCII, so the ratio is high and capped at 1.0.
        assert!(result.ascii_ratio > 0.9);
        assert!(result.ascii_ratio <= 1.0);

        // Byte-stream entropy is bounded by 8 bits.
        assert!(result.entropy > 0.0);
        assert!(result.entropy < 8.0);

        // Only printable characters and spaces in the input.
        assert!(result.control_char_ratio < 0.1);

        assert!(result.likely_encodings.contains(&Charset::Utf8));
    }

    // UTF-8 Chinese text: mostly high bytes, so a low ASCII ratio and a
    // multi-byte/Unicode encoding among the suggestions.
    #[test]
    fn test_chinese_text_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let chinese_text = "你好,世界!測試中文編碼檢測。".as_bytes();

        let result = analyzer.analyze(chinese_text).unwrap();

        assert!(result.ascii_ratio < 0.5);

        assert!(result.entropy > 0.0);

        let has_unicode_encoding = result
            .likely_encodings
            .iter()
            .any(|charset| matches!(charset, Charset::Utf8 | Charset::Gbk | Charset::Big5));
        assert!(has_unicode_encoding);
    }

    // Uniformly cycling all 256 byte values: near-maximal entropy and an
    // ASCII ratio close to 0.5 (128 of 256 values are below 0x80).
    #[test]
    fn test_binary_data_analysis() {
        let mut analyzer = ByteAnalyzer::new();
        let binary_data: Vec<u8> = (0..=255).cycle().take(1000).collect();

        let result = analyzer.analyze(&binary_data).unwrap();

        assert!(result.entropy > 7.0);

        assert!(result.ascii_ratio > 0.4);
        assert!(result.ascii_ratio < 0.6);
    }

    // A uniform distribution must carry more entropy than a single repeated byte.
    #[test]
    fn test_entropy_calculation_accuracy() {
        let mut analyzer = ByteAnalyzer::new();

        let uniform_data: Vec<u8> = (0..=255).collect();
        let uniform_result = analyzer.analyze(&uniform_data).unwrap();

        // Fresh analyzer so counts from the previous run cannot leak in.
        analyzer = ByteAnalyzer::new();

        let single_char_data = vec![b'A'; 100];
        let single_result = analyzer.analyze(&single_char_data).unwrap();

        assert!(uniform_result.entropy > single_result.entropy);
        assert!(single_result.entropy < 1.0);
    }

    // Embedded non-whitespace control bytes raise control_char_ratio and
    // trigger the Windows-1252 suggestion.
    #[test]
    fn test_control_character_detection() {
        let mut analyzer = ByteAnalyzer::new();

        // Normal text with SOH, STX and US control bytes spliced in.
        let mut data_with_control = Vec::new();
        data_with_control.extend_from_slice(b"Normal text ");
        data_with_control.push(0x01); data_with_control.push(0x02); data_with_control.push(0x1F); data_with_control.extend_from_slice(b" more text");

        let result = analyzer.analyze(&data_with_control).unwrap();

        assert!(result.control_char_ratio > 0.0);
        assert!(result.control_char_ratio < 0.5);

        assert!(result.likely_encodings.contains(&Charset::Windows1252));
    }

    // The statistical analyzer scores input against every registered model.
    #[test]
    fn test_statistical_analyzer_language_models() {
        let analyzer = StatisticalAnalyzer::new();

        let utf8_chinese = "这是一个测试文本。".as_bytes();
        let utf8_scores = analyzer.analyze_with_models(utf8_chinese).unwrap();

        assert!(utf8_scores.contains_key(&Charset::Utf8));

        // Bytes chosen to hit the GBK model's common-pattern lead bytes.
        let gbk_pattern = vec![0xB0, 0xA1, 0xC4, 0xE3, 0xBA, 0xC3]; let gbk_scores = analyzer.analyze_with_models(&gbk_pattern).unwrap();

        assert!(gbk_scores.get(&Charset::Gbk).unwrap_or(&0.0) > &0.0);
    }

    // byte_distribution reports exact per-byte occurrence counts.
    #[test]
    fn test_byte_frequency_distribution() {
        let mut analyzer = ByteAnalyzer::new();
        let repeated_data = b"aaabbbccc";

        let result = analyzer.analyze(repeated_data).unwrap();

        assert!(!result.byte_distribution.is_empty());
        assert_eq!(*result.byte_distribution.get(&b'a').unwrap(), 3);
        assert_eq!(*result.byte_distribution.get(&b'b').unwrap(), 3);
        assert_eq!(*result.byte_distribution.get(&b'c').unwrap(), 3);
    }

    // Empty input: all ratios zero, but the suggestion list still falls
    // back to at least one encoding.
    #[test]
    fn test_empty_data_handling() {
        let mut analyzer = ByteAnalyzer::new();
        let empty_data = b"";

        let result = analyzer.analyze(empty_data).unwrap();

        assert_eq!(result.ascii_ratio, 0.0);
        assert_eq!(result.entropy, 0.0);
        assert_eq!(result.control_char_ratio, 0.0);
        assert!(!result.likely_encodings.is_empty());
    }

    // ASCII-heavy data suggests UTF-8; high-byte-heavy data suggests a
    // multi-byte CJK encoding.
    #[test]
    fn test_encoding_suggestion_logic() {
        let mut analyzer = ByteAnalyzer::new();

        let ascii_heavy = b"Hello World! 123 ABC";
        let ascii_result = analyzer.analyze(ascii_heavy).unwrap();
        assert!(ascii_result.likely_encodings.contains(&Charset::Utf8));

        // Fresh analyzer for the second, independent input.
        analyzer = ByteAnalyzer::new();

        // Only high bytes (0x80..=0xFF): low ASCII ratio, high entropy.
        let multibyte_pattern: Vec<u8> = (0x80..=0xFF).cycle().take(100).collect();
        let multibyte_result = analyzer.analyze(&multibyte_pattern).unwrap();

        let has_multibyte_encoding = multibyte_result
            .likely_encodings
            .iter()
            .any(|charset| matches!(charset, Charset::Gbk | Charset::Big5 | Charset::ShiftJis));
        assert!(has_multibyte_encoding);
    }

    // Smoke test: repeating bigrams analyze without panicking. NOTE(review):
    // bigram_frequency is private and unused by the metrics, so there is
    // nothing further to assert here yet.
    #[test]
    fn test_bigram_pattern_analysis() {
        let mut analyzer = ByteAnalyzer::new();

        let pattern_data = b"abcabcabcabc";
        let _result = analyzer.analyze(pattern_data).unwrap();

    }
}