base_d/
alphabet.rs

1use std::collections::HashMap;
2use crate::config::EncodingMode;
3
/// Number of slots in the direct-index decode table: one for every code point
/// below 256, so a character in that range can be decoded by array indexing.
const MAX_LOOKUP_TABLE_SIZE: usize = u8::MAX as usize + 1;
5
6/// Represents an encoding alphabet with its characters and configuration.
7///
8/// An alphabet defines the character set and encoding mode used for converting
9/// binary data to text. Supports three modes: mathematical base conversion,
10/// chunked (RFC 4648), and byte-range mapping.
11#[derive(Debug, Clone)]
12pub struct Alphabet {
13    chars: Vec<char>,
14    char_to_index: HashMap<char, usize>,
15    // Fast lookup table for ASCII/extended ASCII characters
16    lookup_table: Option<Box<[Option<usize>; 256]>>,
17    mode: EncodingMode,
18    padding: Option<char>,
19    start_codepoint: Option<u32>,
20}
21
22impl Alphabet {
23    /// Creates a new alphabet with default settings (BaseConversion mode, no padding).
24    ///
25    /// # Arguments
26    ///
27    /// * `chars` - Vector of characters to use in the alphabet
28    ///
29    /// # Errors
30    ///
31    /// Returns an error if the alphabet is empty or contains duplicate characters.
32    pub fn new(chars: Vec<char>) -> Result<Self, String> {
33        Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
34    }
35    
36    /// Creates a new alphabet with specified encoding mode and optional padding.
37    ///
38    /// # Arguments
39    ///
40    /// * `chars` - Vector of characters to use in the alphabet
41    /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
42    /// * `padding` - Optional padding character (typically '=' for RFC modes)
43    ///
44    /// # Errors
45    ///
46    /// Returns an error if:
47    /// - The alphabet is empty or contains duplicates
48    /// - Chunked mode is used with a non-power-of-two alphabet size
49    pub fn new_with_mode(chars: Vec<char>, mode: EncodingMode, padding: Option<char>) -> Result<Self, String> {
50        Self::new_with_mode_and_range(chars, mode, padding, None)
51    }
52    
53    /// Creates a new alphabet with full configuration including byte-range support.
54    ///
55    /// # Arguments
56    ///
57    /// * `chars` - Vector of characters (empty for ByteRange mode)
58    /// * `mode` - Encoding mode
59    /// * `padding` - Optional padding character
60    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
61    ///
62    /// # Errors
63    ///
64    /// Returns an error if configuration is invalid for the specified mode.
65    pub fn new_with_mode_and_range(chars: Vec<char>, mode: EncodingMode, padding: Option<char>, start_codepoint: Option<u32>) -> Result<Self, String> {
66        // ByteRange mode doesn't need chars, just validates start_codepoint
67        if mode == EncodingMode::ByteRange {
68            if let Some(start) = start_codepoint {
69                // Validate that we can represent all 256 bytes
70                if let Some(end_codepoint) = start.checked_add(255) {
71                    if std::char::from_u32(end_codepoint).is_none() {
72                        return Err(format!("Invalid Unicode range: {}-{}", start, end_codepoint));
73                    }
74                    // Validate all codepoints in range are valid Unicode
75                    for offset in 0..=255 {
76                        if std::char::from_u32(start + offset).is_none() {
77                            return Err(format!("Invalid Unicode codepoint in range: {}", start + offset));
78                        }
79                    }
80                } else {
81                    return Err("Start codepoint too high for 256-byte range".to_string());
82                }
83                
84                return Ok(Alphabet {
85                    chars: Vec::new(),
86                    char_to_index: HashMap::new(),
87                    lookup_table: None,
88                    mode,
89                    padding,
90                    start_codepoint: Some(start),
91                });
92            } else {
93                return Err("ByteRange mode requires start_codepoint".to_string());
94            }
95        }
96        
97        if chars.is_empty() {
98            return Err("Alphabet cannot be empty".to_string());
99        }
100        
101        // Validate alphabet size for chunked mode
102        if mode == EncodingMode::Chunked {
103            let base = chars.len();
104            if !base.is_power_of_two() {
105                return Err(format!("Chunked mode requires power-of-two alphabet size, got {}", base));
106            }
107            // Additional check: ensure we have valid sizes for chunked mode
108            if base != 2 && base != 4 && base != 8 && base != 16 && base != 32 && base != 64 && base != 128 && base != 256 {
109                return Err(format!("Chunked mode requires alphabet size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
110            }
111        }
112        
113        // Validate character properties
114        let mut char_to_index = HashMap::new();
115        for (i, &c) in chars.iter().enumerate() {
116            // Check for duplicate characters
117            if char_to_index.insert(c, i).is_some() {
118                return Err(format!("Duplicate character in alphabet: '{}' (U+{:04X})", c, c as u32));
119            }
120            
121            // Check for invalid Unicode characters
122            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
123                return Err(format!("Control character not allowed in alphabet: U+{:04X}", c as u32));
124            }
125            
126            // Check for whitespace (except in specific cases)
127            if c.is_whitespace() {
128                return Err(format!("Whitespace character not allowed in alphabet: '{}' (U+{:04X})", c, c as u32));
129            }
130        }
131        
132        // Validate padding character if present
133        if let Some(pad) = padding {
134            if char_to_index.contains_key(&pad) {
135                return Err(format!("Padding character '{}' conflicts with alphabet characters", pad));
136            }
137            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
138                return Err(format!("Control character not allowed as padding: U+{:04X}", pad as u32));
139            }
140        }
141        
142        // Build fast lookup table for ASCII characters
143        let lookup_table = if chars.iter().all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32) {
144            let mut table = Box::new([None; 256]);
145            for (i, &c) in chars.iter().enumerate() {
146                table[c as usize] = Some(i);
147            }
148            Some(table)
149        } else {
150            None
151        };
152        
153        Ok(Alphabet {
154            chars,
155            char_to_index,
156            lookup_table,
157            mode,
158            padding,
159            start_codepoint: None,
160        })
161    }
162    
163    /// Creates an alphabet from a string of characters.
164    ///
165    /// # Arguments
166    ///
167    /// * `s` - String containing the alphabet characters
168    pub fn from_str(s: &str) -> Result<Self, String> {
169        let chars: Vec<char> = s.chars().collect();
170        Self::new(chars)
171    }
172    
173    /// Returns the base (radix) of the alphabet.
174    ///
175    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
176    pub fn base(&self) -> usize {
177        match self.mode {
178            EncodingMode::ByteRange => 256,
179            _ => self.chars.len(),
180        }
181    }
182    
183    /// Returns the encoding mode of this alphabet.
184    pub fn mode(&self) -> &EncodingMode {
185        &self.mode
186    }
187    
188    /// Returns the padding character, if any.
189    pub fn padding(&self) -> Option<char> {
190        self.padding
191    }
192    
193    /// Returns the starting Unicode codepoint for ByteRange mode.
194    pub fn start_codepoint(&self) -> Option<u32> {
195        self.start_codepoint
196    }
197    
198    /// Encodes a digit (0 to base-1) as a character.
199    ///
200    /// Returns `None` if the digit is out of range.
201    pub fn encode_digit(&self, digit: usize) -> Option<char> {
202        match self.mode {
203            EncodingMode::ByteRange => {
204                if let Some(start) = self.start_codepoint {
205                    if digit < 256 {
206                        return std::char::from_u32(start + digit as u32);
207                    }
208                }
209                None
210            }
211            _ => self.chars.get(digit).copied(),
212        }
213    }
214    
215    /// Decodes a character back to its digit value.
216    ///
217    /// Returns `None` if the character is not in the alphabet.
218    pub fn decode_char(&self, c: char) -> Option<usize> {
219        match self.mode {
220            EncodingMode::ByteRange => {
221                if let Some(start) = self.start_codepoint {
222                    let codepoint = c as u32;
223                    if codepoint >= start && codepoint < start + 256 {
224                        return Some((codepoint - start) as usize);
225                    }
226                }
227                None
228            }
229            _ => {
230                // Use fast lookup table for ASCII characters
231                if let Some(ref table) = self.lookup_table {
232                    let char_val = c as u32;
233                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
234                        return table[char_val as usize];
235                    }
236                }
237                // Fall back to HashMap for non-ASCII
238                self.char_to_index.get(&c).copied()
239            }
240        }
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `count` distinct characters that are neither whitespace nor
    /// control characters: runs of 26 code points starting at 'A', with each
    /// successive run placed 100 code points higher.
    fn distinct_chars(count: u32) -> Vec<char> {
        (0..count)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_alphabet() {
        let chars = vec![];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c']; // 3 is not power of 2
        let result = Alphabet::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the constructor accepts for chunked mode.
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            let result = Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_out_of_range_powers_of_two() {
        // 1 and 512 are powers of two but fall outside the accepted 2..=256.
        for &size in &[1u32, 512] {
            let result = Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_err(), "Size {} should be rejected", size);
            assert!(result.unwrap_err().contains("alphabet size of 2, 4, 8"));
        }
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c']; // null character
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Alphabet::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_alphabet() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Alphabet::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // Test with a start codepoint so high that start + 255 exceeds max valid Unicode (0x10FFFF)
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80), // 0x10FF80 + 255 = 0x11007F, exceeds 0x10FFFF
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_rejects_surrogates() {
        // The range ends at U+E000 (valid) but starts inside the surrogate
        // block, so only the per-codepoint check can reject it.
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0xDF01),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Invalid Unicode codepoint in range"));
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300), // Valid start in emoji range
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            None,
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_byte_range_round_trip() {
        let alphabet = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        )
        .unwrap();
        assert_eq!(alphabet.base(), 256);
        for digit in 0..256 {
            let c = alphabet.encode_digit(digit).unwrap();
            assert_eq!(alphabet.decode_char(c), Some(digit));
        }
        assert_eq!(alphabet.encode_digit(256), None);
        assert_eq!(alphabet.decode_char('\u{1F400}'), None);
        assert_eq!(alphabet.decode_char('a'), None);
    }

    #[test]
    fn test_encode_decode_round_trip() {
        // All characters below U+0100: decoding goes through the lookup table.
        let hex = Alphabet::from_str("0123456789abcdef").unwrap();
        assert_eq!(hex.base(), 16);
        for digit in 0..hex.base() {
            let c = hex.encode_digit(digit).unwrap();
            assert_eq!(hex.decode_char(c), Some(digit));
        }
        assert_eq!(hex.encode_digit(16), None);
        assert_eq!(hex.decode_char('g'), None);
        assert_eq!(hex.decode_char('\u{4E2D}'), None);

        // A character at or above U+0100 disables the table: decoding goes
        // through the map instead.
        let mixed = Alphabet::from_str("a\u{E9}\u{4E2D}").unwrap();
        assert_eq!(mixed.decode_char('a'), Some(0));
        assert_eq!(mixed.decode_char('\u{E9}'), Some(1));
        assert_eq!(mixed.decode_char('\u{4E2D}'), Some(2));
        assert_eq!(mixed.decode_char('b'), None);
    }

    #[test]
    fn test_detailed_error_messages() {
        // The duplicate error must name the offending character both
        // literally and by code point.
        let chars = vec!['a', 'b', 'a'];
        let err = Alphabet::new(chars).unwrap_err();
        assert!(err.contains("'a'") && err.contains("U+0061"));
    }
}