base_d/core/
dictionary.rs

1use crate::core::config::EncodingMode;
2#[cfg(feature = "simd")]
3use crate::simd::variants::DictionaryMetadata;
4use std::collections::HashMap;
5
// Number of slots in the direct-index decode table. Characters whose code
// point is below this bound are decoded by array indexing instead of hashing.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
7
/// Represents an encoding dictionary with its characters and configuration.
///
/// A dictionary defines the character set and encoding mode used for converting
/// binary data to text. Supports three modes: mathematical base conversion,
/// chunked (RFC 4648), and byte-range mapping.
#[derive(Debug, Clone)]
pub struct Dictionary {
    // Dictionary characters; a character's position is the digit it encodes.
    // Empty in ByteRange mode.
    chars: Vec<char>,
    // Reverse mapping from each character to its position in `chars`.
    char_to_index: HashMap<char, usize>,
    // Fast lookup table for ASCII/extended ASCII characters. Only populated
    // when every dictionary character has a code point below 256.
    lookup_table: Option<Box<[Option<usize>; 256]>>,
    // Encoding mode this dictionary was validated for.
    mode: EncodingMode,
    // Optional padding character; never one of the dictionary characters.
    padding: Option<char>,
    // First code point of the 256-character block used by ByteRange mode;
    // `None` for dictionaries built from an explicit character list.
    start_codepoint: Option<u32>,
}
23
24impl Dictionary {
25    /// Creates a new DictionaryBuilder for constructing a Dictionary.
26    ///
27    /// # Example
28    ///
29    /// ```
30    /// use base_d::{Dictionary, EncodingMode};
31    /// let dict = Dictionary::builder()
32    ///     .chars_from_str("0123456789ABCDEF")
33    ///     .mode(EncodingMode::Radix)
34    ///     .build()
35    ///     .unwrap();
36    /// ```
37    pub fn builder() -> DictionaryBuilder {
38        DictionaryBuilder::new()
39    }
40
41    /// Creates a new dictionary with default settings (Radix mode, no padding).
42    ///
43    /// # Arguments
44    ///
45    /// * `chars` - Vector of characters to use in the dictionary
46    ///
47    /// # Errors
48    ///
49    /// Returns an error if the dictionary is empty or contains duplicate characters.
50    ///
51    /// # Deprecated
52    ///
53    /// Use `Dictionary::builder()` instead for more flexible configuration.
54    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
55    #[allow(deprecated)]
56    pub fn new(chars: Vec<char>) -> Result<Self, String> {
57        Self::new_with_mode(chars, EncodingMode::Radix, None)
58    }
59
60    /// Creates a new dictionary with specified encoding mode and optional padding.
61    ///
62    /// # Arguments
63    ///
64    /// * `chars` - Vector of characters to use in the dictionary
65    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
66    /// * `padding` - Optional padding character (typically '=' for RFC modes)
67    ///
68    /// # Errors
69    ///
70    /// Returns an error if:
71    /// - The dictionary is empty or contains duplicates
72    /// - Chunked mode is used with a non-power-of-two dictionary size
73    ///
74    /// # Deprecated
75    ///
76    /// Use `Dictionary::builder()` instead for more flexible configuration.
77    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
78    #[allow(deprecated)]
79    pub fn new_with_mode(
80        chars: Vec<char>,
81        mode: EncodingMode,
82        padding: Option<char>,
83    ) -> Result<Self, String> {
84        Self::new_with_mode_and_range(chars, mode, padding, None)
85    }
86
87    /// Creates a new dictionary with full configuration including byte-range support.
88    ///
89    /// # Arguments
90    ///
91    /// * `chars` - Vector of characters (empty for ByteRange mode)
92    /// * `mode` - Encoding mode
93    /// * `padding` - Optional padding character
94    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
95    ///
96    /// # Errors
97    ///
98    /// Returns an error if configuration is invalid for the specified mode.
99    ///
100    /// # Deprecated
101    ///
102    /// Use `Dictionary::builder()` instead for more flexible configuration.
103    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
104    pub fn new_with_mode_and_range(
105        chars: Vec<char>,
106        mode: EncodingMode,
107        padding: Option<char>,
108        start_codepoint: Option<u32>,
109    ) -> Result<Self, String> {
110        // ByteRange mode doesn't need chars, just validates start_codepoint
111        if mode == EncodingMode::ByteRange {
112            if let Some(start) = start_codepoint {
113                // Validate that we can represent all 256 bytes
114                if let Some(end_codepoint) = start.checked_add(255) {
115                    if std::char::from_u32(end_codepoint).is_none() {
116                        return Err(format!(
117                            "Invalid Unicode range: {}-{}",
118                            start, end_codepoint
119                        ));
120                    }
121                    // Validate all codepoints in range are valid Unicode
122                    for offset in 0..=255 {
123                        if std::char::from_u32(start + offset).is_none() {
124                            return Err(format!(
125                                "Invalid Unicode codepoint in range: {}",
126                                start + offset
127                            ));
128                        }
129                    }
130                } else {
131                    return Err("Start codepoint too high for 256-byte range".to_string());
132                }
133
134                return Ok(Dictionary {
135                    chars: Vec::new(),
136                    char_to_index: HashMap::new(),
137                    lookup_table: None,
138                    mode,
139                    padding,
140                    start_codepoint: Some(start),
141                });
142            } else {
143                return Err("ByteRange mode requires start_codepoint".to_string());
144            }
145        }
146
147        if chars.is_empty() {
148            return Err("Dictionary cannot be empty".to_string());
149        }
150
151        // Validate dictionary size for chunked mode
152        if mode == EncodingMode::Chunked {
153            let base = chars.len();
154            if !base.is_power_of_two() {
155                return Err(format!(
156                    "Chunked mode requires power-of-two dictionary size, got {}",
157                    base
158                ));
159            }
160            // Additional check: ensure we have valid sizes for chunked mode
161            if base != 2
162                && base != 4
163                && base != 8
164                && base != 16
165                && base != 32
166                && base != 64
167                && base != 128
168                && base != 256
169            {
170                return Err(format!(
171                    "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
172                    base
173                ));
174            }
175        }
176
177        // Validate character properties
178        let mut char_to_index = HashMap::new();
179        for (i, &c) in chars.iter().enumerate() {
180            // Check for duplicate characters
181            if char_to_index.insert(c, i).is_some() {
182                return Err(format!(
183                    "Duplicate character in dictionary: '{}' (U+{:04X})",
184                    c, c as u32
185                ));
186            }
187
188            // Check for invalid Unicode characters
189            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
190                return Err(format!(
191                    "Control character not allowed in dictionary: U+{:04X}",
192                    c as u32
193                ));
194            }
195
196            // Check for whitespace (allow space for RFC-compliant encodings like Base45)
197            if c.is_whitespace() && c != ' ' {
198                return Err(format!(
199                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
200                    c, c as u32
201                ));
202            }
203        }
204
205        // Validate padding character if present
206        if let Some(pad) = padding {
207            if char_to_index.contains_key(&pad) {
208                return Err(format!(
209                    "Padding character '{}' conflicts with dictionary characters",
210                    pad
211                ));
212            }
213            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
214                return Err(format!(
215                    "Control character not allowed as padding: U+{:04X}",
216                    pad as u32
217                ));
218            }
219        }
220
221        // Build fast lookup table for ASCII characters
222        let lookup_table = if chars
223            .iter()
224            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
225        {
226            let mut table = Box::new([None; 256]);
227            for (i, &c) in chars.iter().enumerate() {
228                table[c as usize] = Some(i);
229            }
230            Some(table)
231        } else {
232            None
233        };
234
235        Ok(Dictionary {
236            chars,
237            char_to_index,
238            lookup_table,
239            mode,
240            padding,
241            start_codepoint: None,
242        })
243    }
244
245    /// Creates an dictionary from a string of characters.
246    ///
247    /// # Arguments
248    ///
249    /// * `s` - String containing the dictionary characters
250    ///
251    /// # Deprecated
252    ///
253    /// Use `Dictionary::builder().chars_from_str(s).build()` instead.
254    #[deprecated(
255        since = "0.1.0",
256        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
257    )]
258    #[allow(deprecated, clippy::should_implement_trait)]
259    pub fn from_str(s: &str) -> Result<Self, String> {
260        let chars: Vec<char> = s.chars().collect();
261        Self::new(chars)
262    }
263
264    /// Returns the base (radix) of the dictionary.
265    ///
266    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
267    pub fn base(&self) -> usize {
268        match self.mode {
269            EncodingMode::ByteRange => 256,
270            _ => self.chars.len(),
271        }
272    }
273
274    /// Returns the encoding mode of this dictionary.
275    pub fn mode(&self) -> &EncodingMode {
276        &self.mode
277    }
278
279    /// Returns the padding character, if any.
280    pub fn padding(&self) -> Option<char> {
281        self.padding
282    }
283
284    /// Returns the starting Unicode codepoint for ByteRange mode.
285    pub fn start_codepoint(&self) -> Option<u32> {
286        self.start_codepoint
287    }
288
289    /// Encodes a digit (0 to base-1) as a character.
290    ///
291    /// Returns `None` if the digit is out of range.
292    pub fn encode_digit(&self, digit: usize) -> Option<char> {
293        match self.mode {
294            EncodingMode::ByteRange => {
295                if let Some(start) = self.start_codepoint
296                    && digit < 256
297                {
298                    return std::char::from_u32(start + digit as u32);
299                }
300                None
301            }
302            _ => self.chars.get(digit).copied(),
303        }
304    }
305
306    /// Decodes a character back to its digit value.
307    ///
308    /// Returns `None` if the character is not in the dictionary.
309    pub fn decode_char(&self, c: char) -> Option<usize> {
310        match self.mode {
311            EncodingMode::ByteRange => {
312                if let Some(start) = self.start_codepoint {
313                    let codepoint = c as u32;
314                    if codepoint >= start && codepoint < start + 256 {
315                        return Some((codepoint - start) as usize);
316                    }
317                }
318                None
319            }
320            _ => {
321                // Use fast lookup table for ASCII characters
322                if let Some(ref table) = self.lookup_table {
323                    let char_val = c as u32;
324                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
325                        return table[char_val as usize];
326                    }
327                }
328                // Fall back to HashMap for non-ASCII
329                self.char_to_index.get(&c).copied()
330            }
331        }
332    }
333
334    /// Returns SIMD metadata for this dictionary.
335    ///
336    /// This provides information about whether SIMD acceleration is available
337    /// for this dictionary and which implementation to use.
338    #[cfg(feature = "simd")]
339    pub fn simd_metadata(&self) -> DictionaryMetadata {
340        DictionaryMetadata::from_dictionary(self)
341    }
342
343    /// Returns whether SIMD acceleration is available for this dictionary.
344    ///
345    /// This is a convenience method that checks if SIMD can be used with
346    /// the current CPU features and dictionary configuration.
347    #[cfg(feature = "simd")]
348    pub fn simd_available(&self) -> bool {
349        self.simd_metadata().simd_available()
350    }
351
352    /// Returns whether SIMD acceleration is available for this dictionary.
353    ///
354    /// When the `simd` feature is disabled, this always returns `false`.
355    #[cfg(not(feature = "simd"))]
356    pub fn simd_available(&self) -> bool {
357        false
358    }
359}
360
/// Builder for constructing a Dictionary with flexible configuration.
///
/// Every setting is optional until [`DictionaryBuilder::build`] is called;
/// an unset mode falls back to `EncodingMode::Radix` and unset characters
/// fall back to an empty list.
///
/// # Example
///
/// ```
/// use base_d::{Dictionary, EncodingMode};
/// let dict = Dictionary::builder()
///     .chars_from_str("0123456789ABCDEF")
///     .mode(EncodingMode::Radix)
///     .build()
///     .unwrap();
/// ```
#[derive(Debug, Default)]
pub struct DictionaryBuilder {
    // Dictionary characters, if any were supplied.
    chars: Option<Vec<char>>,
    // Requested encoding mode; `None` means "use the default" at build time.
    mode: Option<EncodingMode>,
    // Optional padding character.
    padding: Option<char>,
    // Starting Unicode codepoint, required by ByteRange mode.
    start_codepoint: Option<u32>,
}
380
381impl DictionaryBuilder {
382    /// Creates a new DictionaryBuilder with default settings.
383    pub fn new() -> Self {
384        Self {
385            chars: None,
386            mode: None,
387            padding: None,
388            start_codepoint: None,
389        }
390    }
391
392    /// Sets the dictionary characters from a vector.
393    ///
394    /// # Arguments
395    ///
396    /// * `chars` - Vector of characters to use in the dictionary
397    pub fn chars(mut self, chars: Vec<char>) -> Self {
398        self.chars = Some(chars);
399        self
400    }
401
402    /// Sets the dictionary characters from a string.
403    ///
404    /// # Arguments
405    ///
406    /// * `s` - String containing the dictionary characters
407    pub fn chars_from_str(mut self, s: &str) -> Self {
408        self.chars = Some(s.chars().collect());
409        self
410    }
411
412    /// Sets the encoding mode.
413    ///
414    /// # Arguments
415    ///
416    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
417    pub fn mode(mut self, mode: EncodingMode) -> Self {
418        self.mode = Some(mode);
419        self
420    }
421
422    /// Sets the padding character.
423    ///
424    /// # Arguments
425    ///
426    /// * `padding` - Padding character (typically '=' for RFC modes)
427    pub fn padding(mut self, padding: char) -> Self {
428        self.padding = Some(padding);
429        self
430    }
431
432    /// Sets the starting Unicode codepoint for ByteRange mode.
433    ///
434    /// # Arguments
435    ///
436    /// * `start_codepoint` - Starting Unicode codepoint
437    pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
438        self.start_codepoint = Some(start_codepoint);
439        self
440    }
441
442    /// Builds the Dictionary with the configured settings.
443    ///
444    /// # Errors
445    ///
446    /// Returns an error if:
447    /// - The configuration is invalid for the specified mode
448    /// - Required fields are missing
449    /// - Validation fails (duplicates, invalid characters, etc.)
450    #[allow(deprecated)]
451    pub fn build(self) -> Result<Dictionary, String> {
452        let mode = self.mode.unwrap_or(EncodingMode::Radix);
453        let chars = self.chars.unwrap_or_default();
454
455        Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
456    }
457}
458
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary from `chars` with otherwise default settings,
    /// expects the build to fail, and returns the error message.
    fn build_error(chars: Vec<char>) -> String {
        Dictionary::builder()
            .chars(chars)
            .build()
            .expect_err("dictionary construction should have failed")
    }

    #[test]
    fn test_duplicate_character_detection() {
        let err = build_error(vec!['a', 'b', 'c', 'a']);
        assert!(err.contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let err = build_error(Vec::new());
        assert!(err.contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // 3 is not a power of 2
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the chunked-mode validation accepts, including the two
        // largest (128 and 256).
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            // Runs of 26 letters starting at 'A', each run shifted by a further
            // 100 code points, so characters stay distinct and avoid control
            // and whitespace code points.
            let chars: Vec<char> = (0..size)
                .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
                .collect();
            let result = Dictionary::builder()
                .chars(chars)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        // null character
        let err = build_error(vec!['a', 'b', '\x00', 'c']);
        assert!(err.contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        // Tab should be rejected (only space is allowed for RFC encodings like Base45)
        let err = build_error(vec!['a', 'b', '\t', 'c']);
        assert!(err.contains("Whitespace"));

        // But space should be allowed (for Base45 RFC 9285 compliance)
        let result_space = Dictionary::builder()
            .chars(vec!['a', 'b', ' ', 'c'])
            .build();
        assert!(result_space.is_ok());
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('b')
            .build();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 = 0x11007F, which exceeds the max valid Unicode
        // codepoint (0x10FFFF)
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80)
            .build();
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        // Valid start in emoji range
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // Error messages should identify the offending character
        let err = build_error(vec!['a', 'b', 'a']);
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    // DictionaryBuilder tests
    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::Radix);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let err = build_error(vec!['a', 'b', 'a']);

        assert!(err.contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        // 3 is not a power of 2
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }
}