base_d/core/
dictionary.rs

1use crate::core::config::EncodingMode;
2use crate::simd::variants::DictionaryMetadata;
3use std::collections::HashMap;
4
/// Number of slots in the direct-indexed decode table.
///
/// A dictionary only gets a lookup table when every one of its characters has
/// a code point below this value; decoding then indexes the table by code
/// point instead of hashing the character.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7/// Represents an encoding dictionary with its characters and configuration.
8///
9/// An dictionary defines the character set and encoding mode used for converting
10/// binary data to text. Supports three modes: mathematical base conversion,
11/// chunked (RFC 4648), and byte-range mapping.
12#[derive(Debug, Clone)]
13pub struct Dictionary {
14    chars: Vec<char>,
15    char_to_index: HashMap<char, usize>,
16    // Fast lookup table for ASCII/extended ASCII characters
17    lookup_table: Option<Box<[Option<usize>; 256]>>,
18    mode: EncodingMode,
19    padding: Option<char>,
20    start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24    /// Creates a new DictionaryBuilder for constructing a Dictionary.
25    ///
26    /// # Example
27    ///
28    /// ```
29    /// use base_d::{Dictionary, EncodingMode};
30    /// let dict = Dictionary::builder()
31    ///     .chars_from_str("0123456789ABCDEF")
32    ///     .mode(EncodingMode::BaseConversion)
33    ///     .build()
34    ///     .unwrap();
35    /// ```
36    pub fn builder() -> DictionaryBuilder {
37        DictionaryBuilder::new()
38    }
39
40    /// Creates a new dictionary with default settings (BaseConversion mode, no padding).
41    ///
42    /// # Arguments
43    ///
44    /// * `chars` - Vector of characters to use in the dictionary
45    ///
46    /// # Errors
47    ///
48    /// Returns an error if the dictionary is empty or contains duplicate characters.
49    ///
50    /// # Deprecated
51    ///
52    /// Use `Dictionary::builder()` instead for more flexible configuration.
53    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
54    pub fn new(chars: Vec<char>) -> Result<Self, String> {
55        Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
56    }
57
58    /// Creates a new dictionary with specified encoding mode and optional padding.
59    ///
60    /// # Arguments
61    ///
62    /// * `chars` - Vector of characters to use in the dictionary
63    /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
64    /// * `padding` - Optional padding character (typically '=' for RFC modes)
65    ///
66    /// # Errors
67    ///
68    /// Returns an error if:
69    /// - The dictionary is empty or contains duplicates
70    /// - Chunked mode is used with a non-power-of-two dictionary size
71    ///
72    /// # Deprecated
73    ///
74    /// Use `Dictionary::builder()` instead for more flexible configuration.
75    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
76    pub fn new_with_mode(
77        chars: Vec<char>,
78        mode: EncodingMode,
79        padding: Option<char>,
80    ) -> Result<Self, String> {
81        Self::new_with_mode_and_range(chars, mode, padding, None)
82    }
83
84    /// Creates a new dictionary with full configuration including byte-range support.
85    ///
86    /// # Arguments
87    ///
88    /// * `chars` - Vector of characters (empty for ByteRange mode)
89    /// * `mode` - Encoding mode
90    /// * `padding` - Optional padding character
91    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
92    ///
93    /// # Errors
94    ///
95    /// Returns an error if configuration is invalid for the specified mode.
96    ///
97    /// # Deprecated
98    ///
99    /// Use `Dictionary::builder()` instead for more flexible configuration.
100    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
101    pub fn new_with_mode_and_range(
102        chars: Vec<char>,
103        mode: EncodingMode,
104        padding: Option<char>,
105        start_codepoint: Option<u32>,
106    ) -> Result<Self, String> {
107        // ByteRange mode doesn't need chars, just validates start_codepoint
108        if mode == EncodingMode::ByteRange {
109            if let Some(start) = start_codepoint {
110                // Validate that we can represent all 256 bytes
111                if let Some(end_codepoint) = start.checked_add(255) {
112                    if std::char::from_u32(end_codepoint).is_none() {
113                        return Err(format!(
114                            "Invalid Unicode range: {}-{}",
115                            start, end_codepoint
116                        ));
117                    }
118                    // Validate all codepoints in range are valid Unicode
119                    for offset in 0..=255 {
120                        if std::char::from_u32(start + offset).is_none() {
121                            return Err(format!(
122                                "Invalid Unicode codepoint in range: {}",
123                                start + offset
124                            ));
125                        }
126                    }
127                } else {
128                    return Err("Start codepoint too high for 256-byte range".to_string());
129                }
130
131                return Ok(Dictionary {
132                    chars: Vec::new(),
133                    char_to_index: HashMap::new(),
134                    lookup_table: None,
135                    mode,
136                    padding,
137                    start_codepoint: Some(start),
138                });
139            } else {
140                return Err("ByteRange mode requires start_codepoint".to_string());
141            }
142        }
143
144        if chars.is_empty() {
145            return Err("Dictionary cannot be empty".to_string());
146        }
147
148        // Validate dictionary size for chunked mode
149        if mode == EncodingMode::Chunked {
150            let base = chars.len();
151            if !base.is_power_of_two() {
152                return Err(format!(
153                    "Chunked mode requires power-of-two dictionary size, got {}",
154                    base
155                ));
156            }
157            // Additional check: ensure we have valid sizes for chunked mode
158            if base != 2
159                && base != 4
160                && base != 8
161                && base != 16
162                && base != 32
163                && base != 64
164                && base != 128
165                && base != 256
166            {
167                return Err(format!("Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
168            }
169        }
170
171        // Validate character properties
172        let mut char_to_index = HashMap::new();
173        for (i, &c) in chars.iter().enumerate() {
174            // Check for duplicate characters
175            if char_to_index.insert(c, i).is_some() {
176                return Err(format!(
177                    "Duplicate character in dictionary: '{}' (U+{:04X})",
178                    c, c as u32
179                ));
180            }
181
182            // Check for invalid Unicode characters
183            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
184                return Err(format!(
185                    "Control character not allowed in dictionary: U+{:04X}",
186                    c as u32
187                ));
188            }
189
190            // Check for whitespace (except in specific cases)
191            if c.is_whitespace() {
192                return Err(format!(
193                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
194                    c, c as u32
195                ));
196            }
197        }
198
199        // Validate padding character if present
200        if let Some(pad) = padding {
201            if char_to_index.contains_key(&pad) {
202                return Err(format!(
203                    "Padding character '{}' conflicts with dictionary characters",
204                    pad
205                ));
206            }
207            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
208                return Err(format!(
209                    "Control character not allowed as padding: U+{:04X}",
210                    pad as u32
211                ));
212            }
213        }
214
215        // Build fast lookup table for ASCII characters
216        let lookup_table = if chars
217            .iter()
218            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
219        {
220            let mut table = Box::new([None; 256]);
221            for (i, &c) in chars.iter().enumerate() {
222                table[c as usize] = Some(i);
223            }
224            Some(table)
225        } else {
226            None
227        };
228
229        Ok(Dictionary {
230            chars,
231            char_to_index,
232            lookup_table,
233            mode,
234            padding,
235            start_codepoint: None,
236        })
237    }
238
239    /// Creates an dictionary from a string of characters.
240    ///
241    /// # Arguments
242    ///
243    /// * `s` - String containing the dictionary characters
244    ///
245    /// # Deprecated
246    ///
247    /// Use `Dictionary::builder().chars_from_str(s).build()` instead.
248    #[deprecated(
249        since = "0.1.0",
250        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
251    )]
252    pub fn from_str(s: &str) -> Result<Self, String> {
253        let chars: Vec<char> = s.chars().collect();
254        Self::new(chars)
255    }
256
257    /// Returns the base (radix) of the dictionary.
258    ///
259    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
260    pub fn base(&self) -> usize {
261        match self.mode {
262            EncodingMode::ByteRange => 256,
263            _ => self.chars.len(),
264        }
265    }
266
267    /// Returns the encoding mode of this dictionary.
268    pub fn mode(&self) -> &EncodingMode {
269        &self.mode
270    }
271
272    /// Returns the padding character, if any.
273    pub fn padding(&self) -> Option<char> {
274        self.padding
275    }
276
277    /// Returns the starting Unicode codepoint for ByteRange mode.
278    pub fn start_codepoint(&self) -> Option<u32> {
279        self.start_codepoint
280    }
281
282    /// Encodes a digit (0 to base-1) as a character.
283    ///
284    /// Returns `None` if the digit is out of range.
285    pub fn encode_digit(&self, digit: usize) -> Option<char> {
286        match self.mode {
287            EncodingMode::ByteRange => {
288                if let Some(start) = self.start_codepoint {
289                    if digit < 256 {
290                        return std::char::from_u32(start + digit as u32);
291                    }
292                }
293                None
294            }
295            _ => self.chars.get(digit).copied(),
296        }
297    }
298
299    /// Decodes a character back to its digit value.
300    ///
301    /// Returns `None` if the character is not in the dictionary.
302    pub fn decode_char(&self, c: char) -> Option<usize> {
303        match self.mode {
304            EncodingMode::ByteRange => {
305                if let Some(start) = self.start_codepoint {
306                    let codepoint = c as u32;
307                    if codepoint >= start && codepoint < start + 256 {
308                        return Some((codepoint - start) as usize);
309                    }
310                }
311                None
312            }
313            _ => {
314                // Use fast lookup table for ASCII characters
315                if let Some(ref table) = self.lookup_table {
316                    let char_val = c as u32;
317                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
318                        return table[char_val as usize];
319                    }
320                }
321                // Fall back to HashMap for non-ASCII
322                self.char_to_index.get(&c).copied()
323            }
324        }
325    }
326
327    /// Returns SIMD metadata for this dictionary.
328    ///
329    /// This provides information about whether SIMD acceleration is available
330    /// for this dictionary and which implementation to use.
331    pub fn simd_metadata(&self) -> DictionaryMetadata {
332        DictionaryMetadata::from_dictionary(self)
333    }
334
335    /// Returns whether SIMD acceleration is available for this dictionary.
336    ///
337    /// This is a convenience method that checks if SIMD can be used with
338    /// the current CPU features and dictionary configuration.
339    pub fn simd_available(&self) -> bool {
340        self.simd_metadata().simd_available()
341    }
342}
343
344/// Builder for constructing a Dictionary with flexible configuration.
345///
346/// # Example
347///
348/// ```
349/// use base_d::{Dictionary, EncodingMode};
350/// let dict = Dictionary::builder()
351///     .chars_from_str("0123456789ABCDEF")
352///     .mode(EncodingMode::BaseConversion)
353///     .build()
354///     .unwrap();
355/// ```
356#[derive(Debug, Default)]
357pub struct DictionaryBuilder {
358    chars: Option<Vec<char>>,
359    mode: Option<EncodingMode>,
360    padding: Option<char>,
361    start_codepoint: Option<u32>,
362}
363
364impl DictionaryBuilder {
365    /// Creates a new DictionaryBuilder with default settings.
366    pub fn new() -> Self {
367        Self {
368            chars: None,
369            mode: None,
370            padding: None,
371            start_codepoint: None,
372        }
373    }
374
375    /// Sets the dictionary characters from a vector.
376    ///
377    /// # Arguments
378    ///
379    /// * `chars` - Vector of characters to use in the dictionary
380    pub fn chars(mut self, chars: Vec<char>) -> Self {
381        self.chars = Some(chars);
382        self
383    }
384
385    /// Sets the dictionary characters from a string.
386    ///
387    /// # Arguments
388    ///
389    /// * `s` - String containing the dictionary characters
390    pub fn chars_from_str(mut self, s: &str) -> Self {
391        self.chars = Some(s.chars().collect());
392        self
393    }
394
395    /// Sets the encoding mode.
396    ///
397    /// # Arguments
398    ///
399    /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
400    pub fn mode(mut self, mode: EncodingMode) -> Self {
401        self.mode = Some(mode);
402        self
403    }
404
405    /// Sets the padding character.
406    ///
407    /// # Arguments
408    ///
409    /// * `padding` - Padding character (typically '=' for RFC modes)
410    pub fn padding(mut self, padding: char) -> Self {
411        self.padding = Some(padding);
412        self
413    }
414
415    /// Sets the starting Unicode codepoint for ByteRange mode.
416    ///
417    /// # Arguments
418    ///
419    /// * `start_codepoint` - Starting Unicode codepoint
420    pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
421        self.start_codepoint = Some(start_codepoint);
422        self
423    }
424
425    /// Builds the Dictionary with the configured settings.
426    ///
427    /// # Errors
428    ///
429    /// Returns an error if:
430    /// - The configuration is invalid for the specified mode
431    /// - Required fields are missing
432    /// - Validation fails (duplicates, invalid characters, etc.)
433    pub fn build(self) -> Result<Dictionary, String> {
434        let mode = self.mode.unwrap_or(EncodingMode::BaseConversion);
435        let chars = self.chars.unwrap_or_default();
436
437        #[allow(deprecated)]
438        Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
439    }
440}
441
#[cfg(test)]
// The deprecated constructors are exercised on purpose: they remain part of
// the public API and must keep validating correctly.
#[allow(deprecated)]
mod tests {
    use super::*;

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c']; // 3 is not power of 2
        let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Test all valid chunked sizes
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            let chars: Vec<char> = (0..size)
                .map(|i| {
                    // Blocks of 26 characters spaced 100 codepoints apart:
                    // distinct, and free of control and whitespace characters.
                    char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap()
                })
                .collect();
            let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_size_one() {
        // 1 is a power of two but not a usable chunked base.
        let result = Dictionary::new_with_mode(vec!['a'], EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires dictionary size of"));
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c']; // null character
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_tab_rejected_as_whitespace() {
        // Tab is exempt from the control-character check but is still
        // rejected by the whitespace check.
        let result = Dictionary::new(vec!['a', '\t']);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // Test with a start codepoint so high that start + 255 exceeds max valid Unicode (0x10FFFF)
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80), // 0x10FF80 + 255 = 0x11007F, exceeds 0x10FFFF
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_overlapping_surrogates() {
        // 0xD7F0 + 255 = 0xD8EF, which lies inside the surrogate gap.
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0xD7F0),
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300), // Valid start in emoji range
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result =
            Dictionary::new_with_mode_and_range(Vec::new(), EncodingMode::ByteRange, None, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // Test that error messages include useful information
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::new(chars).unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    // Digit encoding / decoding tests
    #[test]
    fn test_encode_decode_ascii_dictionary() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.encode_digit(10), Some('A'));
        assert_eq!(dict.decode_char('A'), Some(10));
        assert_eq!(dict.encode_digit(16), None);
        assert_eq!(dict.decode_char('G'), None);
        // Outside the lookup table's code point range.
        assert_eq!(dict.decode_char('α'), None);
    }

    #[test]
    fn test_encode_decode_non_ascii_dictionary() {
        let dict = Dictionary::builder()
            .chars(vec!['α', 'β', 'γ', 'δ'])
            .build()
            .unwrap();

        assert_eq!(dict.encode_digit(3), Some('δ'));
        assert_eq!(dict.decode_char('γ'), Some(2));
        assert_eq!(dict.decode_char('a'), None);
    }

    #[test]
    fn test_encode_decode_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        let first = char::from_u32(0x1F300).unwrap();
        let last = char::from_u32(0x1F3FF).unwrap();
        let past_end = char::from_u32(0x1F400).unwrap();

        assert_eq!(dict.encode_digit(0), Some(first));
        assert_eq!(dict.encode_digit(255), Some(last));
        assert_eq!(dict.encode_digit(256), None);
        assert_eq!(dict.decode_char(first), Some(0));
        assert_eq!(dict.decode_char(last), Some(255));
        assert_eq!(dict.decode_char(past_end), None);
        assert_eq!(dict.decode_char('a'), None);
    }

    // DictionaryBuilder tests
    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::BaseConversion);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let result = Dictionary::builder().chars(vec!['a', 'b', 'a']).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c']) // 3 is not power of 2
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }
}