base_d/core/
dictionary.rs

1use crate::core::config::EncodingMode;
2use crate::simd::variants::DictionaryMetadata;
3use std::collections::HashMap;
4
/// Number of slots in the direct-indexed decode table.
///
/// Only characters whose codepoint is strictly below this value can be
/// resolved through the table; everything else goes through the `HashMap`.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7/// Represents an encoding dictionary with its characters and configuration.
8///
9/// An dictionary defines the character set and encoding mode used for converting
10/// binary data to text. Supports three modes: mathematical base conversion,
11/// chunked (RFC 4648), and byte-range mapping.
12#[derive(Debug, Clone)]
13pub struct Dictionary {
14    chars: Vec<char>,
15    char_to_index: HashMap<char, usize>,
16    // Fast lookup table for ASCII/extended ASCII characters
17    lookup_table: Option<Box<[Option<usize>; 256]>>,
18    mode: EncodingMode,
19    padding: Option<char>,
20    start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24    /// Creates a new DictionaryBuilder for constructing a Dictionary.
25    ///
26    /// # Example
27    ///
28    /// ```
29    /// use base_d::{Dictionary, EncodingMode};
30    /// let dict = Dictionary::builder()
31    ///     .chars_from_str("0123456789ABCDEF")
32    ///     .mode(EncodingMode::Radix)
33    ///     .build()
34    ///     .unwrap();
35    /// ```
36    pub fn builder() -> DictionaryBuilder {
37        DictionaryBuilder::new()
38    }
39
40    /// Creates a new dictionary with default settings (Radix mode, no padding).
41    ///
42    /// # Arguments
43    ///
44    /// * `chars` - Vector of characters to use in the dictionary
45    ///
46    /// # Errors
47    ///
48    /// Returns an error if the dictionary is empty or contains duplicate characters.
49    ///
50    /// # Deprecated
51    ///
52    /// Use `Dictionary::builder()` instead for more flexible configuration.
53    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
54    #[allow(deprecated)]
55    pub fn new(chars: Vec<char>) -> Result<Self, String> {
56        Self::new_with_mode(chars, EncodingMode::Radix, None)
57    }
58
59    /// Creates a new dictionary with specified encoding mode and optional padding.
60    ///
61    /// # Arguments
62    ///
63    /// * `chars` - Vector of characters to use in the dictionary
64    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
65    /// * `padding` - Optional padding character (typically '=' for RFC modes)
66    ///
67    /// # Errors
68    ///
69    /// Returns an error if:
70    /// - The dictionary is empty or contains duplicates
71    /// - Chunked mode is used with a non-power-of-two dictionary size
72    ///
73    /// # Deprecated
74    ///
75    /// Use `Dictionary::builder()` instead for more flexible configuration.
76    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
77    #[allow(deprecated)]
78    pub fn new_with_mode(
79        chars: Vec<char>,
80        mode: EncodingMode,
81        padding: Option<char>,
82    ) -> Result<Self, String> {
83        Self::new_with_mode_and_range(chars, mode, padding, None)
84    }
85
86    /// Creates a new dictionary with full configuration including byte-range support.
87    ///
88    /// # Arguments
89    ///
90    /// * `chars` - Vector of characters (empty for ByteRange mode)
91    /// * `mode` - Encoding mode
92    /// * `padding` - Optional padding character
93    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
94    ///
95    /// # Errors
96    ///
97    /// Returns an error if configuration is invalid for the specified mode.
98    ///
99    /// # Deprecated
100    ///
101    /// Use `Dictionary::builder()` instead for more flexible configuration.
102    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
103    pub fn new_with_mode_and_range(
104        chars: Vec<char>,
105        mode: EncodingMode,
106        padding: Option<char>,
107        start_codepoint: Option<u32>,
108    ) -> Result<Self, String> {
109        // ByteRange mode doesn't need chars, just validates start_codepoint
110        if mode == EncodingMode::ByteRange {
111            if let Some(start) = start_codepoint {
112                // Validate that we can represent all 256 bytes
113                if let Some(end_codepoint) = start.checked_add(255) {
114                    if std::char::from_u32(end_codepoint).is_none() {
115                        return Err(format!(
116                            "Invalid Unicode range: {}-{}",
117                            start, end_codepoint
118                        ));
119                    }
120                    // Validate all codepoints in range are valid Unicode
121                    for offset in 0..=255 {
122                        if std::char::from_u32(start + offset).is_none() {
123                            return Err(format!(
124                                "Invalid Unicode codepoint in range: {}",
125                                start + offset
126                            ));
127                        }
128                    }
129                } else {
130                    return Err("Start codepoint too high for 256-byte range".to_string());
131                }
132
133                return Ok(Dictionary {
134                    chars: Vec::new(),
135                    char_to_index: HashMap::new(),
136                    lookup_table: None,
137                    mode,
138                    padding,
139                    start_codepoint: Some(start),
140                });
141            } else {
142                return Err("ByteRange mode requires start_codepoint".to_string());
143            }
144        }
145
146        if chars.is_empty() {
147            return Err("Dictionary cannot be empty".to_string());
148        }
149
150        // Validate dictionary size for chunked mode
151        if mode == EncodingMode::Chunked {
152            let base = chars.len();
153            if !base.is_power_of_two() {
154                return Err(format!(
155                    "Chunked mode requires power-of-two dictionary size, got {}",
156                    base
157                ));
158            }
159            // Additional check: ensure we have valid sizes for chunked mode
160            if base != 2
161                && base != 4
162                && base != 8
163                && base != 16
164                && base != 32
165                && base != 64
166                && base != 128
167                && base != 256
168            {
169                return Err(format!(
170                    "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
171                    base
172                ));
173            }
174        }
175
176        // Validate character properties
177        let mut char_to_index = HashMap::new();
178        for (i, &c) in chars.iter().enumerate() {
179            // Check for duplicate characters
180            if char_to_index.insert(c, i).is_some() {
181                return Err(format!(
182                    "Duplicate character in dictionary: '{}' (U+{:04X})",
183                    c, c as u32
184                ));
185            }
186
187            // Check for invalid Unicode characters
188            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
189                return Err(format!(
190                    "Control character not allowed in dictionary: U+{:04X}",
191                    c as u32
192                ));
193            }
194
195            // Check for whitespace (except in specific cases)
196            if c.is_whitespace() {
197                return Err(format!(
198                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
199                    c, c as u32
200                ));
201            }
202        }
203
204        // Validate padding character if present
205        if let Some(pad) = padding {
206            if char_to_index.contains_key(&pad) {
207                return Err(format!(
208                    "Padding character '{}' conflicts with dictionary characters",
209                    pad
210                ));
211            }
212            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
213                return Err(format!(
214                    "Control character not allowed as padding: U+{:04X}",
215                    pad as u32
216                ));
217            }
218        }
219
220        // Build fast lookup table for ASCII characters
221        let lookup_table = if chars
222            .iter()
223            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
224        {
225            let mut table = Box::new([None; 256]);
226            for (i, &c) in chars.iter().enumerate() {
227                table[c as usize] = Some(i);
228            }
229            Some(table)
230        } else {
231            None
232        };
233
234        Ok(Dictionary {
235            chars,
236            char_to_index,
237            lookup_table,
238            mode,
239            padding,
240            start_codepoint: None,
241        })
242    }
243
244    /// Creates an dictionary from a string of characters.
245    ///
246    /// # Arguments
247    ///
248    /// * `s` - String containing the dictionary characters
249    ///
250    /// # Deprecated
251    ///
252    /// Use `Dictionary::builder().chars_from_str(s).build()` instead.
253    #[deprecated(
254        since = "0.1.0",
255        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
256    )]
257    #[allow(deprecated, clippy::should_implement_trait)]
258    pub fn from_str(s: &str) -> Result<Self, String> {
259        let chars: Vec<char> = s.chars().collect();
260        Self::new(chars)
261    }
262
263    /// Returns the base (radix) of the dictionary.
264    ///
265    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
266    pub fn base(&self) -> usize {
267        match self.mode {
268            EncodingMode::ByteRange => 256,
269            _ => self.chars.len(),
270        }
271    }
272
273    /// Returns the encoding mode of this dictionary.
274    pub fn mode(&self) -> &EncodingMode {
275        &self.mode
276    }
277
278    /// Returns the padding character, if any.
279    pub fn padding(&self) -> Option<char> {
280        self.padding
281    }
282
283    /// Returns the starting Unicode codepoint for ByteRange mode.
284    pub fn start_codepoint(&self) -> Option<u32> {
285        self.start_codepoint
286    }
287
288    /// Encodes a digit (0 to base-1) as a character.
289    ///
290    /// Returns `None` if the digit is out of range.
291    pub fn encode_digit(&self, digit: usize) -> Option<char> {
292        match self.mode {
293            EncodingMode::ByteRange => {
294                if let Some(start) = self.start_codepoint
295                    && digit < 256
296                {
297                    return std::char::from_u32(start + digit as u32);
298                }
299                None
300            }
301            _ => self.chars.get(digit).copied(),
302        }
303    }
304
305    /// Decodes a character back to its digit value.
306    ///
307    /// Returns `None` if the character is not in the dictionary.
308    pub fn decode_char(&self, c: char) -> Option<usize> {
309        match self.mode {
310            EncodingMode::ByteRange => {
311                if let Some(start) = self.start_codepoint {
312                    let codepoint = c as u32;
313                    if codepoint >= start && codepoint < start + 256 {
314                        return Some((codepoint - start) as usize);
315                    }
316                }
317                None
318            }
319            _ => {
320                // Use fast lookup table for ASCII characters
321                if let Some(ref table) = self.lookup_table {
322                    let char_val = c as u32;
323                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
324                        return table[char_val as usize];
325                    }
326                }
327                // Fall back to HashMap for non-ASCII
328                self.char_to_index.get(&c).copied()
329            }
330        }
331    }
332
333    /// Returns SIMD metadata for this dictionary.
334    ///
335    /// This provides information about whether SIMD acceleration is available
336    /// for this dictionary and which implementation to use.
337    pub fn simd_metadata(&self) -> DictionaryMetadata {
338        DictionaryMetadata::from_dictionary(self)
339    }
340
341    /// Returns whether SIMD acceleration is available for this dictionary.
342    ///
343    /// This is a convenience method that checks if SIMD can be used with
344    /// the current CPU features and dictionary configuration.
345    pub fn simd_available(&self) -> bool {
346        self.simd_metadata().simd_available()
347    }
348}
349
350/// Builder for constructing a Dictionary with flexible configuration.
351///
352/// # Example
353///
354/// ```
355/// use base_d::{Dictionary, EncodingMode};
356/// let dict = Dictionary::builder()
357///     .chars_from_str("0123456789ABCDEF")
358///     .mode(EncodingMode::Radix)
359///     .build()
360///     .unwrap();
361/// ```
362#[derive(Debug, Default)]
363pub struct DictionaryBuilder {
364    chars: Option<Vec<char>>,
365    mode: Option<EncodingMode>,
366    padding: Option<char>,
367    start_codepoint: Option<u32>,
368}
369
370impl DictionaryBuilder {
371    /// Creates a new DictionaryBuilder with default settings.
372    pub fn new() -> Self {
373        Self {
374            chars: None,
375            mode: None,
376            padding: None,
377            start_codepoint: None,
378        }
379    }
380
381    /// Sets the dictionary characters from a vector.
382    ///
383    /// # Arguments
384    ///
385    /// * `chars` - Vector of characters to use in the dictionary
386    pub fn chars(mut self, chars: Vec<char>) -> Self {
387        self.chars = Some(chars);
388        self
389    }
390
391    /// Sets the dictionary characters from a string.
392    ///
393    /// # Arguments
394    ///
395    /// * `s` - String containing the dictionary characters
396    pub fn chars_from_str(mut self, s: &str) -> Self {
397        self.chars = Some(s.chars().collect());
398        self
399    }
400
401    /// Sets the encoding mode.
402    ///
403    /// # Arguments
404    ///
405    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
406    pub fn mode(mut self, mode: EncodingMode) -> Self {
407        self.mode = Some(mode);
408        self
409    }
410
411    /// Sets the padding character.
412    ///
413    /// # Arguments
414    ///
415    /// * `padding` - Padding character (typically '=' for RFC modes)
416    pub fn padding(mut self, padding: char) -> Self {
417        self.padding = Some(padding);
418        self
419    }
420
421    /// Sets the starting Unicode codepoint for ByteRange mode.
422    ///
423    /// # Arguments
424    ///
425    /// * `start_codepoint` - Starting Unicode codepoint
426    pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
427        self.start_codepoint = Some(start_codepoint);
428        self
429    }
430
431    /// Builds the Dictionary with the configured settings.
432    ///
433    /// # Errors
434    ///
435    /// Returns an error if:
436    /// - The configuration is invalid for the specified mode
437    /// - Required fields are missing
438    /// - Validation fails (duplicates, invalid characters, etc.)
439    #[allow(deprecated)]
440    pub fn build(self) -> Result<Dictionary, String> {
441        let mode = self.mode.unwrap_or(EncodingMode::Radix);
442        let chars = self.chars.unwrap_or_default();
443
444        Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
445    }
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `build()` and returns the error message, failing the test if the
    /// dictionary was unexpectedly accepted.
    fn build_error(builder: DictionaryBuilder) -> String {
        builder
            .build()
            .expect_err("dictionary construction should have been rejected")
    }

    #[test]
    fn test_duplicate_character_detection() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'c', 'a']));
        assert!(message.contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let message = build_error(Dictionary::builder().chars(Vec::new()));
        assert!(message.contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three characters: not a power of two.
        let message = build_error(
            Dictionary::builder()
                .chars(vec!['a', 'b', 'c'])
                .mode(EncodingMode::Chunked),
        );
        assert!(message.contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Each of these sizes must be accepted in chunked mode.
        for &size in &[2, 4, 8, 16, 32, 64] {
            // Spread the characters over several blocks so they stay unique.
            let mut alphabet: Vec<char> = Vec::new();
            for i in 0..size {
                let codepoint = 'A' as u32 + (i % 26) + ((i / 26) * 100);
                alphabet.push(char::from_u32(codepoint).unwrap());
            }
            let outcome = Dictionary::builder()
                .chars(alphabet)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(outcome.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        // Contains a NUL character.
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', '\x00', 'c']));
        assert!(message.contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', ' ', 'c']));
        assert!(message.contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let message = build_error(
            Dictionary::builder()
                .chars(vec!['a', 'b', 'c', 'd'])
                .mode(EncodingMode::Radix)
                .padding('b'),
        );
        assert!(message.contains("Padding character"));
        assert!(message.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 = 0x11007F, which is past the Unicode maximum 0x10FFFF.
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80)
            .build();
        assert!(outcome.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        // A start inside the emoji range leaves room for all 256 codepoints.
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let message = build_error(Dictionary::builder().mode(EncodingMode::ByteRange));
        assert!(message.contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // The message should identify the offending character.
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'a']));
        assert!(message.contains("'a'") || message.contains("U+"));
    }

    // DictionaryBuilder tests
    #[test]
    fn test_builder_basic() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(built.base(), 4);
        assert_eq!(built.mode(), &EncodingMode::Radix);
        assert_eq!(built.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let built = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(built.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(built.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let built = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(built.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let built = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(built.mode(), &EncodingMode::ByteRange);
        assert_eq!(built.start_codepoint(), Some(0x1F300));
        assert_eq!(built.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let message = build_error(Dictionary::builder().mode(EncodingMode::ByteRange));
        assert!(message.contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'a']));
        assert!(message.contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        // Three characters: not a power of two.
        let message = build_error(
            Dictionary::builder()
                .chars(vec!['a', 'b', 'c'])
                .mode(EncodingMode::Chunked),
        );
        assert!(message.contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let message = build_error(
            Dictionary::builder()
                .chars(vec!['a', 'b', 'c'])
                .padding('b'),
        );
        assert!(message.contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let built = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(built.base(), 2);
        assert_eq!(built.mode(), &EncodingMode::Chunked);
        assert_eq!(built.padding(), Some('='));
    }
}