base_d/core/
dictionary.rs

1use crate::core::config::EncodingMode;
2use crate::simd::alphabets::AlphabetMetadata;
3use std::collections::HashMap;
4
/// Exclusive upper bound on the code points served by the array-based decode
/// table: the table is only built when every dictionary character's code point
/// is below this value, and only characters below it are looked up in it.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7/// Represents an encoding dictionary with its characters and configuration.
8///
9/// An dictionary defines the character set and encoding mode used for converting
10/// binary data to text. Supports three modes: mathematical base conversion,
11/// chunked (RFC 4648), and byte-range mapping.
12#[derive(Debug, Clone)]
13pub struct Dictionary {
14    chars: Vec<char>,
15    char_to_index: HashMap<char, usize>,
16    // Fast lookup table for ASCII/extended ASCII characters
17    lookup_table: Option<Box<[Option<usize>; 256]>>,
18    mode: EncodingMode,
19    padding: Option<char>,
20    start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24    /// Creates a new dictionary with default settings (BaseConversion mode, no padding).
25    ///
26    /// # Arguments
27    ///
28    /// * `chars` - Vector of characters to use in the dictionary
29    ///
30    /// # Errors
31    ///
32    /// Returns an error if the dictionary is empty or contains duplicate characters.
33    pub fn new(chars: Vec<char>) -> Result<Self, String> {
34        Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
35    }
36
37    /// Creates a new dictionary with specified encoding mode and optional padding.
38    ///
39    /// # Arguments
40    ///
41    /// * `chars` - Vector of characters to use in the dictionary
42    /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
43    /// * `padding` - Optional padding character (typically '=' for RFC modes)
44    ///
45    /// # Errors
46    ///
47    /// Returns an error if:
48    /// - The dictionary is empty or contains duplicates
49    /// - Chunked mode is used with a non-power-of-two dictionary size
50    pub fn new_with_mode(
51        chars: Vec<char>,
52        mode: EncodingMode,
53        padding: Option<char>,
54    ) -> Result<Self, String> {
55        Self::new_with_mode_and_range(chars, mode, padding, None)
56    }
57
58    /// Creates a new dictionary with full configuration including byte-range support.
59    ///
60    /// # Arguments
61    ///
62    /// * `chars` - Vector of characters (empty for ByteRange mode)
63    /// * `mode` - Encoding mode
64    /// * `padding` - Optional padding character
65    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
66    ///
67    /// # Errors
68    ///
69    /// Returns an error if configuration is invalid for the specified mode.
70    pub fn new_with_mode_and_range(
71        chars: Vec<char>,
72        mode: EncodingMode,
73        padding: Option<char>,
74        start_codepoint: Option<u32>,
75    ) -> Result<Self, String> {
76        // ByteRange mode doesn't need chars, just validates start_codepoint
77        if mode == EncodingMode::ByteRange {
78            if let Some(start) = start_codepoint {
79                // Validate that we can represent all 256 bytes
80                if let Some(end_codepoint) = start.checked_add(255) {
81                    if std::char::from_u32(end_codepoint).is_none() {
82                        return Err(format!(
83                            "Invalid Unicode range: {}-{}",
84                            start, end_codepoint
85                        ));
86                    }
87                    // Validate all codepoints in range are valid Unicode
88                    for offset in 0..=255 {
89                        if std::char::from_u32(start + offset).is_none() {
90                            return Err(format!(
91                                "Invalid Unicode codepoint in range: {}",
92                                start + offset
93                            ));
94                        }
95                    }
96                } else {
97                    return Err("Start codepoint too high for 256-byte range".to_string());
98                }
99
100                return Ok(Dictionary {
101                    chars: Vec::new(),
102                    char_to_index: HashMap::new(),
103                    lookup_table: None,
104                    mode,
105                    padding,
106                    start_codepoint: Some(start),
107                });
108            } else {
109                return Err("ByteRange mode requires start_codepoint".to_string());
110            }
111        }
112
113        if chars.is_empty() {
114            return Err("Dictionary cannot be empty".to_string());
115        }
116
117        // Validate dictionary size for chunked mode
118        if mode == EncodingMode::Chunked {
119            let base = chars.len();
120            if !base.is_power_of_two() {
121                return Err(format!(
122                    "Chunked mode requires power-of-two dictionary size, got {}",
123                    base
124                ));
125            }
126            // Additional check: ensure we have valid sizes for chunked mode
127            if base != 2
128                && base != 4
129                && base != 8
130                && base != 16
131                && base != 32
132                && base != 64
133                && base != 128
134                && base != 256
135            {
136                return Err(format!("Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
137            }
138        }
139
140        // Validate character properties
141        let mut char_to_index = HashMap::new();
142        for (i, &c) in chars.iter().enumerate() {
143            // Check for duplicate characters
144            if char_to_index.insert(c, i).is_some() {
145                return Err(format!(
146                    "Duplicate character in dictionary: '{}' (U+{:04X})",
147                    c, c as u32
148                ));
149            }
150
151            // Check for invalid Unicode characters
152            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
153                return Err(format!(
154                    "Control character not allowed in dictionary: U+{:04X}",
155                    c as u32
156                ));
157            }
158
159            // Check for whitespace (except in specific cases)
160            if c.is_whitespace() {
161                return Err(format!(
162                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
163                    c, c as u32
164                ));
165            }
166        }
167
168        // Validate padding character if present
169        if let Some(pad) = padding {
170            if char_to_index.contains_key(&pad) {
171                return Err(format!(
172                    "Padding character '{}' conflicts with dictionary characters",
173                    pad
174                ));
175            }
176            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
177                return Err(format!(
178                    "Control character not allowed as padding: U+{:04X}",
179                    pad as u32
180                ));
181            }
182        }
183
184        // Build fast lookup table for ASCII characters
185        let lookup_table = if chars
186            .iter()
187            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
188        {
189            let mut table = Box::new([None; 256]);
190            for (i, &c) in chars.iter().enumerate() {
191                table[c as usize] = Some(i);
192            }
193            Some(table)
194        } else {
195            None
196        };
197
198        Ok(Dictionary {
199            chars,
200            char_to_index,
201            lookup_table,
202            mode,
203            padding,
204            start_codepoint: None,
205        })
206    }
207
208    /// Creates an dictionary from a string of characters.
209    ///
210    /// # Arguments
211    ///
212    /// * `s` - String containing the dictionary characters
213    pub fn from_str(s: &str) -> Result<Self, String> {
214        let chars: Vec<char> = s.chars().collect();
215        Self::new(chars)
216    }
217
218    /// Returns the base (radix) of the dictionary.
219    ///
220    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
221    pub fn base(&self) -> usize {
222        match self.mode {
223            EncodingMode::ByteRange => 256,
224            _ => self.chars.len(),
225        }
226    }
227
228    /// Returns the encoding mode of this dictionary.
229    pub fn mode(&self) -> &EncodingMode {
230        &self.mode
231    }
232
233    /// Returns the padding character, if any.
234    pub fn padding(&self) -> Option<char> {
235        self.padding
236    }
237
238    /// Returns the starting Unicode codepoint for ByteRange mode.
239    pub fn start_codepoint(&self) -> Option<u32> {
240        self.start_codepoint
241    }
242
243    /// Encodes a digit (0 to base-1) as a character.
244    ///
245    /// Returns `None` if the digit is out of range.
246    pub fn encode_digit(&self, digit: usize) -> Option<char> {
247        match self.mode {
248            EncodingMode::ByteRange => {
249                if let Some(start) = self.start_codepoint {
250                    if digit < 256 {
251                        return std::char::from_u32(start + digit as u32);
252                    }
253                }
254                None
255            }
256            _ => self.chars.get(digit).copied(),
257        }
258    }
259
260    /// Decodes a character back to its digit value.
261    ///
262    /// Returns `None` if the character is not in the dictionary.
263    pub fn decode_char(&self, c: char) -> Option<usize> {
264        match self.mode {
265            EncodingMode::ByteRange => {
266                if let Some(start) = self.start_codepoint {
267                    let codepoint = c as u32;
268                    if codepoint >= start && codepoint < start + 256 {
269                        return Some((codepoint - start) as usize);
270                    }
271                }
272                None
273            }
274            _ => {
275                // Use fast lookup table for ASCII characters
276                if let Some(ref table) = self.lookup_table {
277                    let char_val = c as u32;
278                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
279                        return table[char_val as usize];
280                    }
281                }
282                // Fall back to HashMap for non-ASCII
283                self.char_to_index.get(&c).copied()
284            }
285        }
286    }
287
288    /// Returns SIMD metadata for this dictionary.
289    ///
290    /// This provides information about whether SIMD acceleration is available
291    /// for this dictionary and which implementation to use.
292    pub fn simd_metadata(&self) -> AlphabetMetadata {
293        AlphabetMetadata::from_dictionary(self)
294    }
295
296    /// Returns whether SIMD acceleration is available for this dictionary.
297    ///
298    /// This is a convenience method that checks if SIMD can be used with
299    /// the current CPU features and dictionary configuration.
300    pub fn simd_available(&self) -> bool {
301        self.simd_metadata().simd_available()
302    }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `size` distinct characters that are neither control nor
    /// whitespace, in blocks of 26 starting at 'A' and spaced 100 codepoints apart.
    fn distinct_chars(size: u32) -> Vec<char> {
        (0..size)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_alphabet() {
        let chars = vec![];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c']; // 3 is not power of 2
        let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size chunked mode accepts, including the two largest.
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            let result =
                Dictionary::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_size_one() {
        // 1 is a power of two but cannot encode any bits per character.
        let result = Dictionary::new_with_mode(vec!['a'], EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("got 1"));
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c']; // null character
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_alphabet() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // Test with a start codepoint so high that start + 255 exceeds max valid Unicode (0x10FFFF)
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80), // 0x10FF80 + 255 = 0x11007F, exceeds 0x10FFFF
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300), // Valid start in emoji range
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result =
            Dictionary::new_with_mode_and_range(Vec::new(), EncodingMode::ByteRange, None, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // The message must name both the offending character and its codepoint.
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::new(chars).unwrap_err();
        assert!(err.contains("'a'"));
        assert!(err.contains("U+0061"));
    }

    #[test]
    fn test_encode_decode_round_trip() {
        let dictionary = Dictionary::from_str("0123456789abcdef").unwrap();
        assert_eq!(dictionary.base(), 16);
        for digit in 0..dictionary.base() {
            let c = dictionary.encode_digit(digit).unwrap();
            assert_eq!(dictionary.decode_char(c), Some(digit));
        }
        assert_eq!(dictionary.encode_digit(16), None);
        assert_eq!(dictionary.decode_char('g'), None);
    }

    #[test]
    fn test_byte_range_encode_decode() {
        let dictionary = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        )
        .unwrap();
        assert_eq!(dictionary.base(), 256);
        assert_eq!(dictionary.encode_digit(255), char::from_u32(0x1F3FF));
        assert_eq!(dictionary.encode_digit(256), None);
        assert_eq!(dictionary.decode_char('\u{1F300}'), Some(0));
        assert_eq!(dictionary.decode_char('\u{1F400}'), None);
        assert_eq!(dictionary.decode_char('a'), None);
    }
}
base_d/core/dictionary.rs

base_d/core/
dictionary.rs