base_d/
detection.rs

1use crate::core::config::{DictionariesConfig, EncodingMode};
2use crate::core::dictionary::Dictionary;
3use crate::decode;
4use std::collections::HashSet;
5
6/// A match result from dictionary detection.
7#[derive(Debug, Clone)]
8pub struct DictionaryMatch {
9    /// Name of the matched dictionary
10    pub name: String,
11    /// Confidence score (0.0 to 1.0)
12    pub confidence: f64,
13    /// The dictionary itself
14    pub dictionary: Dictionary,
15}
16
17/// Detector for automatically identifying which dictionary was used to encode data.
18pub struct DictionaryDetector {
19    dictionaries: Vec<(String, Dictionary)>,
20}
21
22impl DictionaryDetector {
23    /// Creates a new detector from a configuration.
24    pub fn new(config: &DictionariesConfig) -> Result<Self, Box<dyn std::error::Error>> {
25        let mut dictionaries = Vec::new();
26        
27        for (name, dict_config) in &config.dictionaries {
28            let dictionary = match dict_config.mode {
29                EncodingMode::ByteRange => {
30                    let start = dict_config.start_codepoint
31                        .ok_or("ByteRange mode requires start_codepoint")?;
32                    Dictionary::new_with_mode_and_range(
33                        Vec::new(),
34                        dict_config.mode.clone(),
35                        None,
36                        Some(start)
37                    )?
38                }
39                _ => {
40                    let chars: Vec<char> = dict_config.chars.chars().collect();
41                    let padding = dict_config.padding.as_ref().and_then(|s| s.chars().next());
42                    Dictionary::new_with_mode(chars, dict_config.mode.clone(), padding)?
43                }
44            };
45            dictionaries.push((name.clone(), dictionary));
46        }
47        
48        Ok(DictionaryDetector { dictionaries })
49    }
50    
51    /// Detect which dictionary was likely used to encode the input.
52    /// Returns matches sorted by confidence (highest first).
53    pub fn detect(&self, input: &str) -> Vec<DictionaryMatch> {
54        let input = input.trim();
55        if input.is_empty() {
56            return Vec::new();
57        }
58        
59        let mut matches = Vec::new();
60        
61        for (name, dict) in &self.dictionaries {
62            if let Some(confidence) = self.score_dictionary(input, dict) {
63                matches.push(DictionaryMatch {
64                    name: name.clone(),
65                    confidence,
66                    dictionary: dict.clone(),
67                });
68            }
69        }
70        
71        // Sort by confidence descending
72        matches.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
73        
74        matches
75    }
76    
77    /// Score how likely a dictionary matches the input.
78    /// Returns Some(confidence) if it's a plausible match, None otherwise.
79    fn score_dictionary(&self, input: &str, dict: &Dictionary) -> Option<f64> {
80        let mut score = 0.0;
81        let mut weight_sum = 0.0;
82        
83        // Weight for each scoring component
84        const CHARSET_WEIGHT: f64 = 0.25;
85        const SPECIFICITY_WEIGHT: f64 = 0.20;  // Increased
86        const PADDING_WEIGHT: f64 = 0.30;       // Increased (very important for RFC standards)
87        const LENGTH_WEIGHT: f64 = 0.15;
88        const DECODE_WEIGHT: f64 = 0.10;
89        
90        // 1. Character set matching
91        let charset_score = self.score_charset(input, dict);
92        score += charset_score * CHARSET_WEIGHT;
93        weight_sum += CHARSET_WEIGHT;
94        
95        // If character set score is too low, skip this dictionary
96        if charset_score < 0.5 {
97            return None;
98        }
99        
100        // 1.5. Specificity - does this dictionary use a focused character set?
101        let specificity_score = self.score_specificity(input, dict);
102        score += specificity_score * SPECIFICITY_WEIGHT;
103        weight_sum += SPECIFICITY_WEIGHT;
104        
105        // 2. Padding detection (for chunked modes)
106        if let Some(padding_score) = self.score_padding(input, dict) {
107            score += padding_score * PADDING_WEIGHT;
108            weight_sum += PADDING_WEIGHT;
109        }
110        
111        // 3. Length validation
112        let length_score = self.score_length(input, dict);
113        score += length_score * LENGTH_WEIGHT;
114        weight_sum += LENGTH_WEIGHT;
115        
116        // 4. Decode validation (try to actually decode)
117        if let Some(decode_score) = self.score_decode(input, dict) {
118            score += decode_score * DECODE_WEIGHT;
119            weight_sum += DECODE_WEIGHT;
120        }
121        
122        // Normalize score
123        if weight_sum > 0.0 {
124            Some(score / weight_sum)
125        } else {
126            None
127        }
128    }
129    
130    /// Score based on character set matching.
131    fn score_charset(&self, input: &str, dict: &Dictionary) -> f64 {
132        // Get all unique characters in input (excluding whitespace and padding)
133        let input_chars: HashSet<char> = input.chars()
134            .filter(|c| !c.is_whitespace() && Some(*c) != dict.padding())
135            .collect();
136        
137        if input_chars.is_empty() {
138            return 0.0;
139        }
140        
141        // For ByteRange mode, check if characters are in the expected range
142        if let Some(start) = dict.start_codepoint() {
143            let in_range = input_chars.iter()
144                .filter(|&&c| {
145                    let code = c as u32;
146                    code >= start && code < start + 256
147                })
148                .count();
149            return in_range as f64 / input_chars.len() as f64;
150        }
151        
152        // Check if all input characters are in the dictionary
153        let mut valid_count = 0;
154        for c in &input_chars {
155            if dict.decode_char(*c).is_some() {
156                valid_count += 1;
157            }
158        }
159        
160        if valid_count < input_chars.len() {
161            // Not all characters are valid - reject this dictionary
162            return 0.0;
163        }
164        
165        // All characters are valid. Now check how well the dictionary size matches
166        let dict_size = dict.base();
167        let input_unique = input_chars.len();
168        
169        // Calculate what percentage of the dictionary is actually used
170        let usage_ratio = input_unique as f64 / dict_size as f64;
171        
172        // Prefer dictionaries where we use most of the character set
173        // This helps distinguish base64 (64 chars) from base85 (85 chars)
174        if usage_ratio > 0.7 {
175            // We're using >70% of dictionary - excellent match
176            1.0
177        } else if usage_ratio > 0.5 {
178            // We're using >50% of dictionary - good match
179            0.85
180        } else if usage_ratio > 0.3 {
181            // We're using >30% of dictionary - okay match
182            0.7
183        } else {
184            // We're using <30% of dictionary - probably wrong
185            // (e.g., using 20 chars of a 85-char dictionary)
186            0.5
187        }
188    }
189    
190    /// Score based on how specific/focused the dictionary character set is.
191    /// Smaller, more focused dictionaries score higher.
192    fn score_specificity(&self, _input: &str, dict: &Dictionary) -> f64 {
193        let dict_size = dict.base();
194        
195        // Prefer smaller, more common dictionaries
196        // This helps distinguish base64 (64) from base85 (85) when both match
197        match dict_size {
198            16 => 1.0,   // hex
199            32 => 0.95,  // base32
200            58 => 0.90,  // base58
201            62 => 0.88,  // base62
202            64 => 0.92,  // base64 (very common)
203            85 => 0.70,  // base85 (less common)
204            256 => 0.60, // base256
205            _ if dict_size < 64 => 0.85,
206            _ if dict_size < 128 => 0.75,
207            _ => 0.65,
208        }
209    }
210    
211    /// Score based on padding character presence and position.
212    fn score_padding(&self, input: &str, dict: &Dictionary) -> Option<f64> {
213        let padding = dict.padding()?;
214        
215        // Chunked modes should have padding at the end (or no padding)
216        if *dict.mode() == EncodingMode::Chunked {
217            let has_padding = input.ends_with(padding);
218            let padding_count = input.chars().filter(|c| *c == padding).count();
219            
220            if has_padding {
221                // Padding should only be at the end
222                let trimmed = input.trim_end_matches(padding);
223                let internal_padding = trimmed.chars().any(|c| c == padding);
224                
225                if internal_padding {
226                    Some(0.5) // Suspicious padding in middle
227                } else if padding_count <= 3 {
228                    Some(1.0) // Valid padding
229                } else {
230                    Some(0.3) // Too much padding
231                }
232            } else {
233                // No padding is also valid for chunked mode
234                Some(0.8)
235            }
236        } else {
237            None
238        }
239    }
240    
241    /// Score based on input length validation for the encoding mode.
242    fn score_length(&self, input: &str, dict: &Dictionary) -> f64 {
243        let length = input.trim().len();
244        
245        match dict.mode() {
246            EncodingMode::Chunked => {
247                // Chunked mode should have specific alignment
248                let base = dict.base();
249                
250                // Remove padding to check alignment
251                let trimmed = if let Some(pad) = dict.padding() {
252                    input.trim_end_matches(pad)
253                } else {
254                    input
255                };
256                
257                // For base64 (6 bits per char), output should be multiple of 4
258                // For base32 (5 bits per char), output should be multiple of 8
259                // For base16 (4 bits per char), output should be multiple of 2
260                let expected_multiple = match base {
261                    64 => 4,
262                    32 => 8,
263                    16 => 2,
264                    _ => return 0.5, // Unknown chunked base
265                };
266                
267                if trimmed.len() % expected_multiple == 0 {
268                    1.0
269                } else {
270                    0.3
271                }
272            }
273            EncodingMode::ByteRange => {
274                // ByteRange is 1:1 mapping, any length is valid
275                1.0
276            }
277            EncodingMode::BaseConversion => {
278                // Mathematical conversion can produce any length
279                if length > 0 {
280                    1.0
281                } else {
282                    0.0
283                }
284            }
285        }
286    }
287    
288    /// Score based on whether the input can be successfully decoded.
289    fn score_decode(&self, input: &str, dict: &Dictionary) -> Option<f64> {
290        match decode(input, dict) {
291            Ok(decoded) => {
292                if decoded.is_empty() {
293                    Some(0.5)
294                } else {
295                    // Successfully decoded!
296                    Some(1.0)
297                }
298            }
299            Err(_) => {
300                // Failed to decode
301                Some(0.0)
302            }
303        }
304    }
305}
306
307/// Convenience function to detect dictionary from input.
308pub fn detect_dictionary(input: &str) -> Result<Vec<DictionaryMatch>, Box<dyn std::error::Error>> {
309    let config = DictionariesConfig::load_with_overrides()?;
310    let detector = DictionaryDetector::new(&config)?;
311    Ok(detector.detect(input))
312}
313
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encode;

    /// Builds a detector over the default dictionary configuration.
    fn default_detector() -> DictionaryDetector {
        let config = DictionariesConfig::load_default().unwrap();
        DictionaryDetector::new(&config).unwrap()
    }

    /// Padded standard base64 is recognised with high confidence.
    #[test]
    fn test_detect_base64() {
        let ranked = default_detector().detect("SGVsbG8sIFdvcmxkIQ==");

        assert!(!ranked.is_empty());
        let best = &ranked[0];
        // base64 and base64url are nearly identical, so either one is accepted
        assert!(matches!(best.name.as_str(), "base64" | "base64url"));
        assert!(best.confidence > 0.7);
    }

    /// A padded base32 string lists some base32 variant among the leading candidates.
    #[test]
    fn test_detect_base32() {
        let ranked = default_detector().detect("JBSWY3DPEBLW64TMMQ======");

        assert!(!ranked.is_empty());
        // Look only at the five best-ranked candidates
        let base32_found = ranked
            .iter()
            .take(5)
            .any(|candidate| candidate.name.starts_with("base32"));
        assert!(base32_found, "base32 should be in top 5 candidates");
    }

    /// Lowercase hex digits are recognised as one of the hex dictionaries.
    #[test]
    fn test_detect_hex() {
        let ranked = default_detector().detect("48656c6c6f");

        assert!(!ranked.is_empty());
        let best = &ranked[0];
        // Both hex flavours count as a correct answer
        assert!(matches!(best.name.as_str(), "hex" | "hex_math"));
        assert!(best.confidence > 0.8);
    }

    /// Text produced by the real base64 encoder is detected as base64.
    #[test]
    fn test_detect_from_encoded() {
        let config = DictionariesConfig::load_default().unwrap();

        // Build the base64 dictionary by hand and encode a sample with it
        let base64_config = config.get_dictionary("base64").unwrap();
        let alphabet: Vec<char> = base64_config.chars.chars().collect();
        let pad = base64_config
            .padding
            .as_ref()
            .and_then(|s| s.chars().next());
        let dict = Dictionary::new_with_mode(alphabet, base64_config.mode.clone(), pad).unwrap();

        let data = b"Hello, World!";
        let encoded = encode(data, &dict);

        let detector = DictionaryDetector::new(&config).unwrap();
        let ranked = detector.detect(&encoded);

        assert!(!ranked.is_empty());
        // base64 and base64url differ in only two characters, so both are accepted
        assert!(matches!(ranked[0].name.as_str(), "base64" | "base64url"));
    }

    /// Empty input produces no candidates at all.
    #[test]
    fn test_detect_empty_input() {
        let ranked = default_detector().detect("");
        assert!(ranked.is_empty());
    }

    /// Input in an unrelated script never yields a confident match.
    #[test]
    fn test_detect_invalid_input() {
        // Characters that are not expected to belong to any encoding dictionary
        let ranked = default_detector().detect("こんにちは世界");

        // Any candidate that is still reported must have low confidence
        if let Some(best) = ranked.first() {
            assert!(best.confidence < 0.5);
        }
    }
}