base_d/features/
detection.rs1use crate::core::config::{DictionaryRegistry, EncodingMode};
2use crate::core::dictionary::Dictionary;
3use crate::decode;
4use std::collections::HashSet;
5
6#[derive(Debug, Clone)]
8pub struct DictionaryMatch {
9 pub name: String,
11 pub confidence: f64,
13 pub dictionary: Dictionary,
15}
16
17pub struct DictionaryDetector {
19 dictionaries: Vec<(String, Dictionary)>,
20}
21
22impl DictionaryDetector {
23 pub fn new(config: &DictionaryRegistry) -> Result<Self, Box<dyn std::error::Error>> {
25 let mut dictionaries = Vec::new();
26
27 for (name, dict_config) in &config.dictionaries {
28 let dictionary = match dict_config.mode {
29 EncodingMode::ByteRange => {
30 let start = dict_config
31 .start_codepoint
32 .ok_or("ByteRange mode requires start_codepoint")?;
33 Dictionary::builder()
34 .mode(dict_config.mode.clone())
35 .start_codepoint(start)
36 .build()?
37 }
38 _ => {
39 let chars: Vec<char> = dict_config.chars.chars().collect();
40 let padding = dict_config.padding.as_ref().and_then(|s| s.chars().next());
41 let mut builder = Dictionary::builder()
42 .chars(chars)
43 .mode(dict_config.mode.clone());
44 if let Some(p) = padding {
45 builder = builder.padding(p);
46 }
47 builder.build()?
48 }
49 };
50 dictionaries.push((name.clone(), dictionary));
51 }
52
53 Ok(DictionaryDetector { dictionaries })
54 }
55
56 pub fn detect(&self, input: &str) -> Vec<DictionaryMatch> {
59 let input = input.trim();
60 if input.is_empty() {
61 return Vec::new();
62 }
63
64 let mut matches = Vec::new();
65
66 for (name, dict) in &self.dictionaries {
67 if let Some(confidence) = self.score_dictionary(input, dict) {
68 matches.push(DictionaryMatch {
69 name: name.clone(),
70 confidence,
71 dictionary: dict.clone(),
72 });
73 }
74 }
75
76 matches.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
78
79 matches
80 }
81
82 fn score_dictionary(&self, input: &str, dict: &Dictionary) -> Option<f64> {
85 let mut score = 0.0;
86 let mut weight_sum = 0.0;
87
88 const CHARSET_WEIGHT: f64 = 0.25;
90 const SPECIFICITY_WEIGHT: f64 = 0.20; const PADDING_WEIGHT: f64 = 0.30; const LENGTH_WEIGHT: f64 = 0.15;
93 const DECODE_WEIGHT: f64 = 0.10;
94
95 let charset_score = self.score_charset(input, dict);
97 score += charset_score * CHARSET_WEIGHT;
98 weight_sum += CHARSET_WEIGHT;
99
100 if charset_score < 0.5 {
102 return None;
103 }
104
105 let specificity_score = self.score_specificity(input, dict);
107 score += specificity_score * SPECIFICITY_WEIGHT;
108 weight_sum += SPECIFICITY_WEIGHT;
109
110 if let Some(padding_score) = self.score_padding(input, dict) {
112 score += padding_score * PADDING_WEIGHT;
113 weight_sum += PADDING_WEIGHT;
114 }
115
116 let length_score = self.score_length(input, dict);
118 score += length_score * LENGTH_WEIGHT;
119 weight_sum += LENGTH_WEIGHT;
120
121 if let Some(decode_score) = self.score_decode(input, dict) {
123 score += decode_score * DECODE_WEIGHT;
124 weight_sum += DECODE_WEIGHT;
125 }
126
127 if weight_sum > 0.0 {
129 Some(score / weight_sum)
130 } else {
131 None
132 }
133 }
134
135 fn score_charset(&self, input: &str, dict: &Dictionary) -> f64 {
137 let input_chars: HashSet<char> = input
139 .chars()
140 .filter(|c| !c.is_whitespace() && Some(*c) != dict.padding())
141 .collect();
142
143 if input_chars.is_empty() {
144 return 0.0;
145 }
146
147 if let Some(start) = dict.start_codepoint() {
149 let in_range = input_chars
150 .iter()
151 .filter(|&&c| {
152 let code = c as u32;
153 code >= start && code < start + 256
154 })
155 .count();
156 return in_range as f64 / input_chars.len() as f64;
157 }
158
159 let mut valid_count = 0;
161 for c in &input_chars {
162 if dict.decode_char(*c).is_some() {
163 valid_count += 1;
164 }
165 }
166
167 if valid_count < input_chars.len() {
168 return 0.0;
170 }
171
172 let dict_size = dict.base();
174 let input_unique = input_chars.len();
175
176 let usage_ratio = input_unique as f64 / dict_size as f64;
178
179 if usage_ratio > 0.7 {
182 1.0
184 } else if usage_ratio > 0.5 {
185 0.85
187 } else if usage_ratio > 0.3 {
188 0.7
190 } else {
191 0.5
194 }
195 }
196
197 fn score_specificity(&self, _input: &str, dict: &Dictionary) -> f64 {
200 let dict_size = dict.base();
201
202 match dict_size {
205 16 => 1.0, 32 => 0.95, 58 => 0.90, 62 => 0.88, 64 => 0.92, 85 => 0.70, 256 => 0.60, _ if dict_size < 64 => 0.85,
213 _ if dict_size < 128 => 0.75,
214 _ => 0.65,
215 }
216 }
217
218 fn score_padding(&self, input: &str, dict: &Dictionary) -> Option<f64> {
220 let padding = dict.padding()?;
221
222 if *dict.mode() == EncodingMode::Chunked {
224 let has_padding = input.ends_with(padding);
225 let padding_count = input.chars().filter(|c| *c == padding).count();
226
227 if has_padding {
228 let trimmed = input.trim_end_matches(padding);
230 let internal_padding = trimmed.chars().any(|c| c == padding);
231
232 if internal_padding {
233 Some(0.5) } else if padding_count <= 3 {
235 Some(1.0) } else {
237 Some(0.3) }
239 } else {
240 Some(0.8)
242 }
243 } else {
244 None
245 }
246 }
247
248 fn score_length(&self, input: &str, dict: &Dictionary) -> f64 {
250 let length = input.trim().len();
251
252 match dict.mode() {
253 EncodingMode::Chunked => {
254 let base = dict.base();
256
257 let trimmed = if let Some(pad) = dict.padding() {
259 input.trim_end_matches(pad)
260 } else {
261 input
262 };
263
264 let expected_multiple = match base {
268 64 => 4,
269 32 => 8,
270 16 => 2,
271 _ => return 0.5, };
273
274 if trimmed.len() % expected_multiple == 0 {
275 1.0
276 } else {
277 0.3
278 }
279 }
280 EncodingMode::ByteRange => {
281 1.0
283 }
284 EncodingMode::BaseConversion => {
285 if length > 0 { 1.0 } else { 0.0 }
287 }
288 }
289 }
290
291 fn score_decode(&self, input: &str, dict: &Dictionary) -> Option<f64> {
293 match decode(input, dict) {
294 Ok(decoded) => {
295 if decoded.is_empty() {
296 Some(0.5)
297 } else {
298 Some(1.0)
300 }
301 }
302 Err(_) => {
303 Some(0.0)
305 }
306 }
307 }
308}
309
310pub fn detect_dictionary(input: &str) -> Result<Vec<DictionaryMatch>, Box<dyn std::error::Error>> {
312 let config = DictionaryRegistry::load_with_overrides()?;
313 let detector = DictionaryDetector::new(&config)?;
314 Ok(detector.detect(input))
315}
316
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encode;

    /// Builds a detector over the default registry.
    fn default_detector() -> DictionaryDetector {
        let config = DictionaryRegistry::load_default().unwrap();
        DictionaryDetector::new(&config).unwrap()
    }

    #[test]
    fn test_detect_base64() {
        let candidates = default_detector().detect("SGVsbG8sIFdvcmxkIQ==");

        assert!(!candidates.is_empty());
        let best = &candidates[0];
        assert!(matches!(best.name.as_str(), "base64" | "base64url"));
        assert!(best.confidence > 0.7);
    }

    #[test]
    fn test_detect_base32() {
        let candidates = default_detector().detect("JBSWY3DPEBLW64TMMQ======");

        assert!(!candidates.is_empty());
        let base32_found = candidates
            .iter()
            .take(5)
            .any(|m| m.name.starts_with("base32"));
        assert!(base32_found, "base32 should be in top 5 candidates");
    }

    #[test]
    fn test_detect_hex() {
        let candidates = default_detector().detect("48656c6c6f");

        assert!(!candidates.is_empty());
        let best = &candidates[0];
        assert!(matches!(best.name.as_str(), "hex" | "hex_math"));
        assert!(best.confidence > 0.8);
    }

    #[test]
    fn test_detect_from_encoded() {
        let config = DictionaryRegistry::load_default().unwrap();

        // Rebuild the base64 dictionary by hand so the input really is the
        // crate's own encoder output.
        let dict_config = config.get_dictionary("base64").unwrap();
        let alphabet: Vec<char> = dict_config.chars.chars().collect();
        let mut builder = Dictionary::builder()
            .chars(alphabet)
            .mode(dict_config.mode.clone());
        if let Some(pad) = dict_config.padding.as_ref().and_then(|s| s.chars().next()) {
            builder = builder.padding(pad);
        }
        let dict = builder.build().unwrap();

        let encoded = encode(b"Hello, World!", &dict);

        let detector = DictionaryDetector::new(&config).unwrap();
        let candidates = detector.detect(&encoded);

        assert!(!candidates.is_empty());
        assert!(matches!(
            candidates[0].name.as_str(),
            "base64" | "base64url"
        ));
    }

    #[test]
    fn test_detect_empty_input() {
        assert!(default_detector().detect("").is_empty());
    }

    #[test]
    fn test_detect_invalid_input() {
        let candidates = default_detector().detect("こんにちは世界");

        // Either nothing matches, or the best candidate is a weak one.
        if let Some(best) = candidates.first() {
            assert!(best.confidence < 0.5);
        }
    }
}