base_d/
alphabet.rs

1use std::collections::HashMap;
2use crate::config::EncodingMode;
3
4/// Represents an encoding alphabet with its characters and configuration.
5///
6/// An alphabet defines the character set and encoding mode used for converting
7/// binary data to text. Supports three modes: mathematical base conversion,
8/// chunked (RFC 4648), and byte-range mapping.
9#[derive(Debug, Clone)]
10pub struct Alphabet {
11    chars: Vec<char>,
12    char_to_index: HashMap<char, usize>,
13    mode: EncodingMode,
14    padding: Option<char>,
15    start_codepoint: Option<u32>,
16}
17
18impl Alphabet {
19    /// Creates a new alphabet with default settings (BaseConversion mode, no padding).
20    ///
21    /// # Arguments
22    ///
23    /// * `chars` - Vector of characters to use in the alphabet
24    ///
25    /// # Errors
26    ///
27    /// Returns an error if the alphabet is empty or contains duplicate characters.
28    pub fn new(chars: Vec<char>) -> Result<Self, String> {
29        Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
30    }
31    
32    /// Creates a new alphabet with specified encoding mode and optional padding.
33    ///
34    /// # Arguments
35    ///
36    /// * `chars` - Vector of characters to use in the alphabet
37    /// * `mode` - Encoding mode (BaseConversion, Chunked, or ByteRange)
38    /// * `padding` - Optional padding character (typically '=' for RFC modes)
39    ///
40    /// # Errors
41    ///
42    /// Returns an error if:
43    /// - The alphabet is empty or contains duplicates
44    /// - Chunked mode is used with a non-power-of-two alphabet size
45    pub fn new_with_mode(chars: Vec<char>, mode: EncodingMode, padding: Option<char>) -> Result<Self, String> {
46        Self::new_with_mode_and_range(chars, mode, padding, None)
47    }
48    
49    /// Creates a new alphabet with full configuration including byte-range support.
50    ///
51    /// # Arguments
52    ///
53    /// * `chars` - Vector of characters (empty for ByteRange mode)
54    /// * `mode` - Encoding mode
55    /// * `padding` - Optional padding character
56    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
57    ///
58    /// # Errors
59    ///
60    /// Returns an error if configuration is invalid for the specified mode.
61    pub fn new_with_mode_and_range(chars: Vec<char>, mode: EncodingMode, padding: Option<char>, start_codepoint: Option<u32>) -> Result<Self, String> {
62        // ByteRange mode doesn't need chars, just validates start_codepoint
63        if mode == EncodingMode::ByteRange {
64            if let Some(start) = start_codepoint {
65                // Validate that we can represent all 256 bytes
66                if let Some(end_codepoint) = start.checked_add(255) {
67                    if std::char::from_u32(end_codepoint).is_none() {
68                        return Err(format!("Invalid Unicode range: {}-{}", start, end_codepoint));
69                    }
70                    // Validate all codepoints in range are valid Unicode
71                    for offset in 0..=255 {
72                        if std::char::from_u32(start + offset).is_none() {
73                            return Err(format!("Invalid Unicode codepoint in range: {}", start + offset));
74                        }
75                    }
76                } else {
77                    return Err("Start codepoint too high for 256-byte range".to_string());
78                }
79                
80                return Ok(Alphabet {
81                    chars: Vec::new(),
82                    char_to_index: HashMap::new(),
83                    mode,
84                    padding,
85                    start_codepoint: Some(start),
86                });
87            } else {
88                return Err("ByteRange mode requires start_codepoint".to_string());
89            }
90        }
91        
92        if chars.is_empty() {
93            return Err("Alphabet cannot be empty".to_string());
94        }
95        
96        // Validate alphabet size for chunked mode
97        if mode == EncodingMode::Chunked {
98            let base = chars.len();
99            if !base.is_power_of_two() {
100                return Err(format!("Chunked mode requires power-of-two alphabet size, got {}", base));
101            }
102            // Additional check: ensure we have valid sizes for chunked mode
103            if base != 2 && base != 4 && base != 8 && base != 16 && base != 32 && base != 64 && base != 128 && base != 256 {
104                return Err(format!("Chunked mode requires alphabet size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
105            }
106        }
107        
108        // Validate character properties
109        let mut char_to_index = HashMap::new();
110        for (i, &c) in chars.iter().enumerate() {
111            // Check for duplicate characters
112            if char_to_index.insert(c, i).is_some() {
113                return Err(format!("Duplicate character in alphabet: '{}' (U+{:04X})", c, c as u32));
114            }
115            
116            // Check for invalid Unicode characters
117            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
118                return Err(format!("Control character not allowed in alphabet: U+{:04X}", c as u32));
119            }
120            
121            // Check for whitespace (except in specific cases)
122            if c.is_whitespace() {
123                return Err(format!("Whitespace character not allowed in alphabet: '{}' (U+{:04X})", c, c as u32));
124            }
125        }
126        
127        // Validate padding character if present
128        if let Some(pad) = padding {
129            if char_to_index.contains_key(&pad) {
130                return Err(format!("Padding character '{}' conflicts with alphabet characters", pad));
131            }
132            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
133                return Err(format!("Control character not allowed as padding: U+{:04X}", pad as u32));
134            }
135        }
136        
137        Ok(Alphabet {
138            chars,
139            char_to_index,
140            mode,
141            padding,
142            start_codepoint: None,
143        })
144    }
145    
146    /// Creates an alphabet from a string of characters.
147    ///
148    /// # Arguments
149    ///
150    /// * `s` - String containing the alphabet characters
151    pub fn from_str(s: &str) -> Result<Self, String> {
152        let chars: Vec<char> = s.chars().collect();
153        Self::new(chars)
154    }
155    
156    /// Returns the base (radix) of the alphabet.
157    ///
158    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
159    pub fn base(&self) -> usize {
160        match self.mode {
161            EncodingMode::ByteRange => 256,
162            _ => self.chars.len(),
163        }
164    }
165    
166    /// Returns the encoding mode of this alphabet.
167    pub fn mode(&self) -> &EncodingMode {
168        &self.mode
169    }
170    
171    /// Returns the padding character, if any.
172    pub fn padding(&self) -> Option<char> {
173        self.padding
174    }
175    
176    /// Returns the starting Unicode codepoint for ByteRange mode.
177    pub fn start_codepoint(&self) -> Option<u32> {
178        self.start_codepoint
179    }
180    
181    /// Encodes a digit (0 to base-1) as a character.
182    ///
183    /// Returns `None` if the digit is out of range.
184    pub fn encode_digit(&self, digit: usize) -> Option<char> {
185        match self.mode {
186            EncodingMode::ByteRange => {
187                if let Some(start) = self.start_codepoint {
188                    if digit < 256 {
189                        return std::char::from_u32(start + digit as u32);
190                    }
191                }
192                None
193            }
194            _ => self.chars.get(digit).copied(),
195        }
196    }
197    
198    /// Decodes a character back to its digit value.
199    ///
200    /// Returns `None` if the character is not in the alphabet.
201    pub fn decode_char(&self, c: char) -> Option<usize> {
202        match self.mode {
203            EncodingMode::ByteRange => {
204                if let Some(start) = self.start_codepoint {
205                    let codepoint = c as u32;
206                    if codepoint >= start && codepoint < start + 256 {
207                        return Some((codepoint - start) as usize);
208                    }
209                }
210                None
211            }
212            _ => self.char_to_index.get(&c).copied(),
213        }
214    }
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `count` distinct characters that are neither control nor
    /// whitespace: runs of 26 codepoints starting at 'A', each run placed
    /// 100 codepoints above the previous one.
    fn distinct_chars(count: u32) -> Vec<char> {
        (0..count)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + (i / 26) * 100).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let result = Alphabet::new(vec!['a', 'b', 'c', 'a']);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_alphabet() {
        let result = Alphabet::new(Vec::new());
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // 3 is not a power of two
        let result = Alphabet::new_with_mode(vec!['a', 'b', 'c'], EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the constructor accepts for chunked mode.
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            let result =
                Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_out_of_range_powers_of_two() {
        // 1 and 512 are powers of two but outside the supported 2..=256 window.
        for &size in &[1u32, 512] {
            let result =
                Alphabet::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_err(), "Size {} should be rejected", size);
            assert!(result.unwrap_err().contains("Chunked mode requires"));
        }
    }

    #[test]
    fn test_control_character_rejection() {
        // '\x00' is the null character
        let result = Alphabet::new(vec!['a', 'b', '\x00', 'c']);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let result = Alphabet::new(vec!['a', 'b', ' ', 'c']);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_alphabet() {
        let result = Alphabet::new_with_mode(
            vec!['a', 'b', 'c', 'd'],
            EncodingMode::BaseConversion,
            Some('b'),
        );
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let result = Alphabet::new_with_mode(
            vec!['a', 'b', 'c', 'd'],
            EncodingMode::BaseConversion,
            Some('='),
        );
        assert!(result.is_ok());
        assert_eq!(result.unwrap().padding(), Some('='));
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 = 0x11007F, which is past the last valid codepoint (0x10FFFF)
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80),
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        // 0x1F300 is a valid start in the emoji range
        let result = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result =
            Alphabet::new_with_mode_and_range(Vec::new(), EncodingMode::ByteRange, None, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // The duplicate error names the character and its codepoint.
        let err = Alphabet::new(vec!['a', 'b', 'a']).unwrap_err();
        assert!(err.contains("'a'"));
        assert!(err.contains("U+0061"));
    }

    #[test]
    fn test_digit_round_trip_standard() {
        let alphabet = Alphabet::from_str("abcd").unwrap();
        assert_eq!(alphabet.base(), 4);
        assert_eq!(alphabet.start_codepoint(), None);
        for digit in 0..4 {
            let c = alphabet.encode_digit(digit).unwrap();
            assert_eq!(alphabet.decode_char(c), Some(digit));
        }
        assert_eq!(alphabet.encode_digit(4), None);
        assert_eq!(alphabet.decode_char('z'), None);
    }

    #[test]
    fn test_digit_round_trip_byte_range() {
        let start = 0x1F300;
        let alphabet = Alphabet::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(start),
        )
        .unwrap();
        assert_eq!(alphabet.base(), 256);
        assert_eq!(alphabet.start_codepoint(), Some(start));
        for digit in 0..256 {
            let c = alphabet.encode_digit(digit).unwrap();
            assert_eq!(c as u32, start + digit as u32);
            assert_eq!(alphabet.decode_char(c), Some(digit));
        }
        assert_eq!(alphabet.encode_digit(256), None);
        assert_eq!(alphabet.decode_char(char::from_u32(start - 1).unwrap()), None);
        assert_eq!(alphabet.decode_char(char::from_u32(start + 256).unwrap()), None);
    }
}