1use crate::core::config::{DictionariesConfig, EncodingMode};
2use crate::core::dictionary::Dictionary;
3use crate::decode;
4use std::collections::HashSet;
5
/// One candidate produced by [`DictionaryDetector::detect`]: a configured
/// dictionary together with the confidence that the input was encoded with it.
#[derive(Debug, Clone)]
pub struct DictionaryMatch {
    /// Name of the dictionary, as keyed in the dictionaries configuration.
    pub name: String,
    /// Weighted average of the individual heuristic scores; higher is better.
    pub confidence: f64,
    /// A clone of the matching dictionary.
    pub dictionary: Dictionary,
}
16
/// Scores an input string against a fixed set of named dictionaries.
pub struct DictionaryDetector {
    /// `(name, dictionary)` pairs built from the configuration in `new`.
    dictionaries: Vec<(String, Dictionary)>,
}
21
22impl DictionaryDetector {
23 pub fn new(config: &DictionariesConfig) -> Result<Self, Box<dyn std::error::Error>> {
25 let mut dictionaries = Vec::new();
26
27 for (name, dict_config) in &config.dictionaries {
28 let dictionary = match dict_config.mode {
29 EncodingMode::ByteRange => {
30 let start = dict_config.start_codepoint
31 .ok_or("ByteRange mode requires start_codepoint")?;
32 Dictionary::new_with_mode_and_range(
33 Vec::new(),
34 dict_config.mode.clone(),
35 None,
36 Some(start)
37 )?
38 }
39 _ => {
40 let chars: Vec<char> = dict_config.chars.chars().collect();
41 let padding = dict_config.padding.as_ref().and_then(|s| s.chars().next());
42 Dictionary::new_with_mode(chars, dict_config.mode.clone(), padding)?
43 }
44 };
45 dictionaries.push((name.clone(), dictionary));
46 }
47
48 Ok(DictionaryDetector { dictionaries })
49 }
50
51 pub fn detect(&self, input: &str) -> Vec<DictionaryMatch> {
54 let input = input.trim();
55 if input.is_empty() {
56 return Vec::new();
57 }
58
59 let mut matches = Vec::new();
60
61 for (name, dict) in &self.dictionaries {
62 if let Some(confidence) = self.score_dictionary(input, dict) {
63 matches.push(DictionaryMatch {
64 name: name.clone(),
65 confidence,
66 dictionary: dict.clone(),
67 });
68 }
69 }
70
71 matches.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
73
74 matches
75 }
76
77 fn score_dictionary(&self, input: &str, dict: &Dictionary) -> Option<f64> {
80 let mut score = 0.0;
81 let mut weight_sum = 0.0;
82
83 const CHARSET_WEIGHT: f64 = 0.25;
85 const SPECIFICITY_WEIGHT: f64 = 0.20; const PADDING_WEIGHT: f64 = 0.30; const LENGTH_WEIGHT: f64 = 0.15;
88 const DECODE_WEIGHT: f64 = 0.10;
89
90 let charset_score = self.score_charset(input, dict);
92 score += charset_score * CHARSET_WEIGHT;
93 weight_sum += CHARSET_WEIGHT;
94
95 if charset_score < 0.5 {
97 return None;
98 }
99
100 let specificity_score = self.score_specificity(input, dict);
102 score += specificity_score * SPECIFICITY_WEIGHT;
103 weight_sum += SPECIFICITY_WEIGHT;
104
105 if let Some(padding_score) = self.score_padding(input, dict) {
107 score += padding_score * PADDING_WEIGHT;
108 weight_sum += PADDING_WEIGHT;
109 }
110
111 let length_score = self.score_length(input, dict);
113 score += length_score * LENGTH_WEIGHT;
114 weight_sum += LENGTH_WEIGHT;
115
116 if let Some(decode_score) = self.score_decode(input, dict) {
118 score += decode_score * DECODE_WEIGHT;
119 weight_sum += DECODE_WEIGHT;
120 }
121
122 if weight_sum > 0.0 {
124 Some(score / weight_sum)
125 } else {
126 None
127 }
128 }
129
130 fn score_charset(&self, input: &str, dict: &Dictionary) -> f64 {
132 let input_chars: HashSet<char> = input.chars()
134 .filter(|c| !c.is_whitespace() && Some(*c) != dict.padding())
135 .collect();
136
137 if input_chars.is_empty() {
138 return 0.0;
139 }
140
141 if let Some(start) = dict.start_codepoint() {
143 let in_range = input_chars.iter()
144 .filter(|&&c| {
145 let code = c as u32;
146 code >= start && code < start + 256
147 })
148 .count();
149 return in_range as f64 / input_chars.len() as f64;
150 }
151
152 let mut valid_count = 0;
154 for c in &input_chars {
155 if dict.decode_char(*c).is_some() {
156 valid_count += 1;
157 }
158 }
159
160 if valid_count < input_chars.len() {
161 return 0.0;
163 }
164
165 let dict_size = dict.base();
167 let input_unique = input_chars.len();
168
169 let usage_ratio = input_unique as f64 / dict_size as f64;
171
172 if usage_ratio > 0.7 {
175 1.0
177 } else if usage_ratio > 0.5 {
178 0.85
180 } else if usage_ratio > 0.3 {
181 0.7
183 } else {
184 0.5
187 }
188 }
189
190 fn score_specificity(&self, _input: &str, dict: &Dictionary) -> f64 {
193 let dict_size = dict.base();
194
195 match dict_size {
198 16 => 1.0, 32 => 0.95, 58 => 0.90, 62 => 0.88, 64 => 0.92, 85 => 0.70, 256 => 0.60, _ if dict_size < 64 => 0.85,
206 _ if dict_size < 128 => 0.75,
207 _ => 0.65,
208 }
209 }
210
211 fn score_padding(&self, input: &str, dict: &Dictionary) -> Option<f64> {
213 let padding = dict.padding()?;
214
215 if *dict.mode() == EncodingMode::Chunked {
217 let has_padding = input.ends_with(padding);
218 let padding_count = input.chars().filter(|c| *c == padding).count();
219
220 if has_padding {
221 let trimmed = input.trim_end_matches(padding);
223 let internal_padding = trimmed.chars().any(|c| c == padding);
224
225 if internal_padding {
226 Some(0.5) } else if padding_count <= 3 {
228 Some(1.0) } else {
230 Some(0.3) }
232 } else {
233 Some(0.8)
235 }
236 } else {
237 None
238 }
239 }
240
241 fn score_length(&self, input: &str, dict: &Dictionary) -> f64 {
243 let length = input.trim().len();
244
245 match dict.mode() {
246 EncodingMode::Chunked => {
247 let base = dict.base();
249
250 let trimmed = if let Some(pad) = dict.padding() {
252 input.trim_end_matches(pad)
253 } else {
254 input
255 };
256
257 let expected_multiple = match base {
261 64 => 4,
262 32 => 8,
263 16 => 2,
264 _ => return 0.5, };
266
267 if trimmed.len() % expected_multiple == 0 {
268 1.0
269 } else {
270 0.3
271 }
272 }
273 EncodingMode::ByteRange => {
274 1.0
276 }
277 EncodingMode::BaseConversion => {
278 if length > 0 {
280 1.0
281 } else {
282 0.0
283 }
284 }
285 }
286 }
287
288 fn score_decode(&self, input: &str, dict: &Dictionary) -> Option<f64> {
290 match decode(input, dict) {
291 Ok(decoded) => {
292 if decoded.is_empty() {
293 Some(0.5)
294 } else {
295 Some(1.0)
297 }
298 }
299 Err(_) => {
300 Some(0.0)
302 }
303 }
304 }
305}
306
307pub fn detect_dictionary(input: &str) -> Result<Vec<DictionaryMatch>, Box<dyn std::error::Error>> {
309 let config = DictionariesConfig::load_with_overrides()?;
310 let detector = DictionaryDetector::new(&config)?;
311 Ok(detector.detect(input))
312}
313
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encode;

    /// Builds a detector over the default dictionary configuration.
    fn default_detector() -> DictionaryDetector {
        let config = DictionariesConfig::load_default().unwrap();
        DictionaryDetector::new(&config).unwrap()
    }

    #[test]
    fn test_detect_base64() {
        let matches = default_detector().detect("SGVsbG8sIFdvcmxkIQ==");

        assert!(!matches.is_empty());
        let best = &matches[0];
        assert!(matches!(best.name.as_str(), "base64" | "base64url"));
        assert!(best.confidence > 0.7);
    }

    #[test]
    fn test_detect_base32() {
        let matches = default_detector().detect("JBSWY3DPEBLW64TMMQ======");

        assert!(!matches.is_empty());
        let base32_found = matches
            .iter()
            .take(5)
            .any(|candidate| candidate.name.starts_with("base32"));
        assert!(base32_found, "base32 should be in top 5 candidates");
    }

    #[test]
    fn test_detect_hex() {
        let matches = default_detector().detect("48656c6c6f");

        assert!(!matches.is_empty());
        let best = &matches[0];
        assert!(matches!(best.name.as_str(), "hex" | "hex_math"));
        assert!(best.confidence > 0.8);
    }

    #[test]
    fn test_detect_from_encoded() {
        let config = DictionariesConfig::load_default().unwrap();

        // Build the base64 dictionary by hand and round-trip through `encode`.
        let base64_config = config.get_dictionary("base64").unwrap();
        let alphabet: Vec<char> = base64_config.chars.chars().collect();
        let pad = base64_config
            .padding
            .as_ref()
            .and_then(|s| s.chars().next());
        let dict = Dictionary::new_with_mode(alphabet, base64_config.mode.clone(), pad).unwrap();

        let encoded = encode(b"Hello, World!", &dict);

        let detector = DictionaryDetector::new(&config).unwrap();
        let matches = detector.detect(&encoded);

        assert!(!matches.is_empty());
        assert!(matches!(matches[0].name.as_str(), "base64" | "base64url"));
    }

    #[test]
    fn test_detect_empty_input() {
        assert!(default_detector().detect("").is_empty());
    }

    #[test]
    fn test_detect_invalid_input() {
        let matches = default_detector().detect("こんにちは世界");

        // Input outside every alphabet may still match something, but only weakly.
        if let Some(best) = matches.first() {
            assert!(best.confidence < 0.5);
        }
    }
}